From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001
From: Felix Thomasmathibalan
Date: Wed, 27 Sep 2023 17:46:17 +0100
Subject: Apply clang-format on repository

Code is formatted as per a revised clang-format configuration file
(not part of this delivery). Version 14.0.6 is used.

Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)
And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow-up for formatting of .cl files and the files under
tests/ and compute_kernel_writer/validation/.

Signed-off-by: Felix Thomasmathibalan
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Gunes Bayir
---
 src/c/AclContext.cpp | 21 +- src/c/AclQueue.cpp | 6 +- src/c/AclTensor.cpp | 29 +- src/c/AclTensorPack.cpp | 10 +- src/c/AclVersion.cpp | 3 +- src/c/cl/AclOpenClExt.cpp | 45 +- src/common/AllocatorWrapper.cpp | 3 +- src/common/AllocatorWrapper.h | 6 +- src/common/IContext.h | 13 +- src/common/IOperator.cpp | 4 +- src/common/IOperator.h | 7 +- src/common/IQueue.h | 4 +- src/common/ITensorV2.cpp | 4 +- src/common/ITensorV2.h | 7 +- src/common/TensorPack.cpp | 4 +- src/common/TensorPack.h | 5 +- src/common/cpuinfo/CpuInfo.cpp | 106 +- src/common/cpuinfo/CpuIsaInfo.cpp | 49 +- src/common/cpuinfo/CpuIsaInfo.h | 27 +- src/common/cpuinfo/CpuModel.cpp | 34 +- src/common/cpuinfo/CpuModel.h | 4 +- src/common/utils/LegacySupport.cpp | 25 +- src/common/utils/Log.h | 57 +- src/common/utils/Macros.h | 2 +- src/common/utils/Object.h | 8 +- src/common/utils/Utils.h | 5 +- src/common/utils/Validate.h | 2 +- src/core/AccessWindowAutoPadding.cpp | 16 +- src/core/AccessWindowAutoPadding.h | 9 +- src/core/AccessWindowStatic.cpp | 45 +- src/core/AccessWindowStatic.h | 9 +- src/core/AccessWindowTranspose.cpp | 54 +- src/core/AccessWindowTranspose.h | 5 +- src/core/CL/CLCommandBuffer.cpp | 2 +- src/core/CL/CLCommandBuffer.h | 5 +- src/core/CL/CLCompatCommandBuffer.cpp | 32 +- src/core/CL/CLCompatCommandBuffer.h | 5 +- src/core/CL/CLCompileContext.cpp | 91 +- src/core/CL/CLHelpers.cpp | 93 +- src/core/CL/CLKernelLibrary.cpp | 14 +- src/core/CL/CLMutableCommandBuffer.cpp | 36 +- src/core/CL/CLMutableCommandBuffer.h | 5 +- src/core/CL/CLUtils.cpp | 35 +- src/core/CL/CLUtils.h | 7 +- src/core/CL/CLValidate.h | 18 +- src/core/CL/DefaultLWSHeuristics.cpp | 14 +- src/core/CL/ICLKernel.cpp | 32 +- src/core/CL/ICLKernel.h | 60 +- src/core/CL/ICLSimple2DKernel.cpp | 3 +- src/core/CL/ICLSimple2DKernel.h | 2 +- src/core/CL/ICLSimple3DKernel.cpp | 3 +- src/core/CL/ICLSimple3DKernel.h | 2 +- src/core/CL/ICLSimpleKernel.cpp | 17 +- src/core/CL/ICLSimpleKernel.h | 9 +- src/core/CL/ICLTensor.cpp | 3 +- src/core/CL/OpenCL.cpp | 549 +++--- src/core/CL/cl_kernels/activation_float_helpers.h | 13 +- src/core/CL/cl_kernels/activation_quant_helpers.h | 15 +- src/core/CL/cl_kernels/gemm_helpers.h | 252 +-- src/core/CL/cl_kernels/helpers.h | 817 ++++---- src/core/CL/cl_kernels/helpers_asymm.h | 337 ++-- src/core/CL/cl_kernels/load_store_utility.h | 73 +- src/core/CL/cl_kernels/repeat.h | 42 +- src/core/CL/cl_kernels/warp_helpers.h | 59 +- src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp | 80 +- src/core/CL/kernels/CLArgMinMaxLayerKernel.h | 10 +- 
.../CL/kernels/CLBatchNormalizationLayerKernel.cpp | 125 +- .../CL/kernels/CLBatchNormalizationLayerKernel.h | 32 +- src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp | 76 +- src/core/CL/kernels/CLBatchToSpaceLayerKernel.h | 25 +- src/core/CL/kernels/CLBitwiseKernel.cpp | 25 +- src/core/CL/kernels/CLBitwiseKernel.h | 6 +- .../CL/kernels/CLBoundingBoxTransformKernel.cpp | 42 +- src/core/CL/kernels/CLBoundingBoxTransformKernel.h | 16 +- .../CL/kernels/CLChannelShuffleLayerKernel.cpp | 64 +- src/core/CL/kernels/CLChannelShuffleLayerKernel.h | 5 +- src/core/CL/kernels/CLComparisonKernel.cpp | 75 +- src/core/CL/kernels/CLComparisonKernel.h | 14 +- .../kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 25 +- .../kernels/CLDeconvolutionLayerUpsampleKernel.h | 5 +- .../kernels/CLDeconvolutionReshapeOutputKernel.cpp | 84 +- .../kernels/CLDeconvolutionReshapeOutputKernel.h | 23 +- src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp | 29 +- src/core/CL/kernels/CLDepthToSpaceLayerKernel.h | 4 +- .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 199 +- .../CLDepthwiseConvolutionLayerNativeKernel.h | 45 +- src/core/CL/kernels/CLFFTDigitReverseKernel.cpp | 42 +- src/core/CL/kernels/CLFFTDigitReverseKernel.h | 18 +- src/core/CL/kernels/CLFFTRadixStageKernel.cpp | 46 +- src/core/CL/kernels/CLFFTRadixStageKernel.h | 9 +- src/core/CL/kernels/CLFFTScaleKernel.cpp | 26 +- src/core/CL/kernels/CLFFTScaleKernel.h | 9 +- src/core/CL/kernels/CLFillBorderKernel.cpp | 59 +- src/core/CL/kernels/CLFillBorderKernel.h | 18 +- .../CL/kernels/CLFuseBatchNormalizationKernel.cpp | 129 +- .../CL/kernels/CLFuseBatchNormalizationKernel.h | 41 +- src/core/CL/kernels/CLGatherKernel.cpp | 36 +- src/core/CL/kernels/CLGatherKernel.h | 10 +- .../CL/kernels/CLGenerateProposalsLayerKernel.cpp | 33 +- .../CL/kernels/CLGenerateProposalsLayerKernel.h | 7 +- .../kernels/CLInstanceNormalizationLayerKernel.cpp | 54 +- .../kernels/CLInstanceNormalizationLayerKernel.h | 16 +- src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp | 45 +- src/core/CL/kernels/CLL2NormalizeLayerKernel.h | 11 +- src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp | 45 +- src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h | 11 +- .../CL/kernels/CLMeanStdDevNormalizationKernel.cpp | 19 +- .../CL/kernels/CLMeanStdDevNormalizationKernel.h | 5 +- src/core/CL/kernels/CLNormalizationLayerKernel.cpp | 72 +- src/core/CL/kernels/CLNormalizationLayerKernel.h | 7 +- .../CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp | 56 +- .../CL/kernels/CLNormalizePlanarYUVLayerKernel.h | 9 +- src/core/CL/kernels/CLPadLayerKernel.cpp | 95 +- src/core/CL/kernels/CLPadLayerKernel.h | 20 +- src/core/CL/kernels/CLPriorBoxLayerKernel.cpp | 83 +- src/core/CL/kernels/CLPriorBoxLayerKernel.h | 27 +- .../CL/kernels/CLQLSTMLayerNormalizationKernel.cpp | 47 +- .../CL/kernels/CLQLSTMLayerNormalizationKernel.h | 9 +- src/core/CL/kernels/CLROIAlignLayerKernel.cpp | 51 +- src/core/CL/kernels/CLROIAlignLayerKernel.h | 14 +- src/core/CL/kernels/CLROIPoolingLayerKernel.cpp | 38 +- src/core/CL/kernels/CLROIPoolingLayerKernel.h | 14 +- src/core/CL/kernels/CLRangeKernel.cpp | 38 +- src/core/CL/kernels/CLRangeKernel.h | 1 + src/core/CL/kernels/CLReductionOperationKernel.cpp | 103 +- src/core/CL/kernels/CLReductionOperationKernel.h | 10 +- src/core/CL/kernels/CLReorgLayerKernel.cpp | 41 +- src/core/CL/kernels/CLReorgLayerKernel.h | 1 + src/core/CL/kernels/CLReverseKernel.cpp | 16 +- src/core/CL/kernels/CLReverseKernel.h | 5 +- src/core/CL/kernels/CLSelectKernel.cpp | 33 +- src/core/CL/kernels/CLSelectKernel.h | 7 +- 
src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp | 100 +- src/core/CL/kernels/CLSpaceToBatchLayerKernel.h | 35 +- src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp | 23 +- src/core/CL/kernels/CLSpaceToDepthLayerKernel.h | 4 +- src/core/CL/kernels/CLStackLayerKernel.cpp | 38 +- src/core/CL/kernels/CLStackLayerKernel.h | 17 +- src/core/CL/kernels/CLStridedSliceKernel.cpp | 101 +- src/core/CL/kernels/CLStridedSliceKernel.h | 24 +- src/core/CL/kernels/CLTileKernel.cpp | 30 +- src/core/CL/kernels/CLTileKernel.h | 5 +- src/core/CPP/CPPTypes.cpp | 4 +- src/core/CPP/Validate.h | 26 +- .../CPPBoxWithNonMaximaSuppressionLimitKernel.cpp | 171 +- .../CPP/kernels/CPPNonMaximumSuppressionKernel.cpp | 110 +- src/core/CPP/kernels/CPPPermuteKernel.cpp | 45 +- src/core/CPP/kernels/CPPTopKVKernel.cpp | 43 +- src/core/CPP/kernels/CPPUpsampleKernel.cpp | 17 +- src/core/Error.cpp | 5 +- src/core/GPUTarget.cpp | 97 +- src/core/Helpers.cpp | 27 +- src/core/IAccessWindow.cpp | 79 +- src/core/IKernel.cpp | 3 +- src/core/ITensor.cpp | 34 +- src/core/ITensorPack.cpp | 9 +- src/core/NEON/NEAsymm.h | 308 ++- src/core/NEON/NEAsymm.inl | 10 +- src/core/NEON/NEFixedPoint.inl | 8 +- src/core/NEON/NEMath.inl | 105 +- src/core/NEON/NESymm.h | 95 +- src/core/NEON/SVEAsymm.h | 47 +- src/core/NEON/SVEMath.h | 8 +- src/core/NEON/SVEMath.inl | 70 +- src/core/NEON/SVESymm.h | 23 +- .../kernels/NEBatchNormalizationLayerKernel.cpp | 302 +-- .../NEON/kernels/NEBatchNormalizationLayerKernel.h | 21 +- .../NEON/kernels/NEBatchToSpaceLayerKernel.cpp | 127 +- src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h | 13 +- src/core/NEON/kernels/NEBitwiseAndKernel.cpp | 17 +- src/core/NEON/kernels/NEBitwiseNotKernel.cpp | 14 +- src/core/NEON/kernels/NEBitwiseOrKernel.cpp | 18 +- src/core/NEON/kernels/NEBitwiseXorKernel.cpp | 18 +- .../NEON/kernels/NEBoundingBoxTransformKernel.cpp | 68 +- .../NEON/kernels/NEBoundingBoxTransformKernel.h | 8 +- .../NEON/kernels/NEChannelShuffleLayerKernel.cpp | 97 +- src/core/NEON/kernels/NECol2ImKernel.h | 4 +- src/core/NEON/kernels/NECropKernel.cpp | 238 +-- src/core/NEON/kernels/NECropKernel.h | 19 +- .../NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 76 +- src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp | 149 +- src/core/NEON/kernels/NEFFTDigitReverseKernel.h | 6 +- src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | 594 +++--- src/core/NEON/kernels/NEFFTRadixStageKernel.h | 14 +- src/core/NEON/kernels/NEFFTScaleKernel.cpp | 21 +- src/core/NEON/kernels/NEFFTScaleKernel.h | 4 +- src/core/NEON/kernels/NEFillBorderKernel.cpp | 225 ++- src/core/NEON/kernels/NEFillBorderKernel.h | 11 +- .../kernels/NEFuseBatchNormalizationKernel.cpp | 244 +-- .../NEON/kernels/NEFuseBatchNormalizationKernel.h | 39 +- src/core/NEON/kernels/NEGatherKernel.cpp | 80 +- src/core/NEON/kernels/NEGatherKernel.h | 5 +- .../kernels/NEGenerateProposalsLayerKernel.cpp | 48 +- .../NEON/kernels/NEGenerateProposalsLayerKernel.h | 2 +- .../kernels/NEInstanceNormalizationLayerKernel.cpp | 57 +- .../kernels/NEInstanceNormalizationLayerKernel.h | 8 +- src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 59 +- src/core/NEON/kernels/NEL2NormalizeLayerKernel.h | 3 +- src/core/NEON/kernels/NELogicalKernel.cpp | 91 +- src/core/NEON/kernels/NELogicalKernel.h | 5 +- .../kernels/NEMeanStdDevNormalizationKernel.cpp | 54 +- .../NEON/kernels/NENormalizationLayerKernel.cpp | 144 +- src/core/NEON/kernels/NENormalizationLayerKernel.h | 8 +- src/core/NEON/kernels/NEPadLayerKernel.cpp | 106 +- src/core/NEON/kernels/NEPadLayerKernel.h | 13 +- 
src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp | 166 +- src/core/NEON/kernels/NEPriorBoxLayerKernel.h | 14 +- .../kernels/NEQLSTMLayerNormalizationKernel.cpp | 118 +- .../NEON/kernels/NEQLSTMLayerNormalizationKernel.h | 33 +- src/core/NEON/kernels/NEROIAlignLayerKernel.cpp | 79 +- src/core/NEON/kernels/NEROIAlignLayerKernel.h | 5 +- src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp | 85 +- src/core/NEON/kernels/NEROIPoolingLayerKernel.h | 8 +- src/core/NEON/kernels/NERangeKernel.cpp | 90 +- src/core/NEON/kernels/NERangeKernel.h | 1 + .../NEON/kernels/NEReductionOperationKernel.cpp | 1955 ++++++++++---------- src/core/NEON/kernels/NEReductionOperationKernel.h | 3 +- src/core/NEON/kernels/NEReorderKernel.cpp | 70 +- src/core/NEON/kernels/NEReorderKernel.h | 33 +- src/core/NEON/kernels/NEReorgLayerKernel.cpp | 56 +- src/core/NEON/kernels/NEReverseKernel.cpp | 98 +- src/core/NEON/kernels/NEReverseKernel.h | 3 +- src/core/NEON/kernels/NESelectKernel.cpp | 156 +- src/core/NEON/kernels/NESelectKernel.h | 2 +- .../NEON/kernels/NESpaceToBatchLayerKernel.cpp | 161 +- src/core/NEON/kernels/NESpaceToBatchLayerKernel.h | 20 +- .../NEON/kernels/NESpaceToDepthLayerKernel.cpp | 59 +- src/core/NEON/kernels/NESpaceToDepthLayerKernel.h | 1 + src/core/NEON/kernels/NEStackLayerKernel.cpp | 55 +- src/core/NEON/kernels/NEStackLayerKernel.h | 10 +- src/core/NEON/kernels/NEStridedSliceKernel.cpp | 115 +- src/core/NEON/kernels/NEStridedSliceKernel.h | 23 +- src/core/NEON/kernels/NETileKernel.cpp | 47 +- src/core/NEON/kernels/assembly/depthwise.hpp | 270 +-- .../NEON/kernels/assembly/depthwise_common.hpp | 106 +- src/core/NEON/kernels/assembly/pool_common.hpp | 71 +- src/core/NEON/kernels/assembly/pooling.hpp | 210 ++- src/core/NEON/kernels/assembly/premultiply.hpp | 17 +- src/core/NEON/kernels/assembly/winograd.hpp | 181 +- .../kernels/batchnormalization/impl/NEON/fp16.cpp | 166 +- .../kernels/batchnormalization/impl/NEON/fp32.cpp | 166 +- .../kernels/batchnormalization/impl/SVE/fp16.cpp | 115 +- .../kernels/batchnormalization/impl/SVE/fp32.cpp | 115 +- .../NEON/kernels/batchnormalization/impl/list.h | 6 +- .../kernels/detail/NEActivationFunctionDetail.h | 7 +- .../NEON/kernels/detail/NEColorConvertHelper.inl | 735 ++++---- .../NEON/kernels/detail/NEDirectConvolution3x3.h | 80 +- .../kernels/detail/NEDirectConvolutionDetail.h | 507 ++--- src/core/NEON/wrapper/intrinsics/cvt.h | 47 +- src/core/NEON/wrapper/intrinsics/div.h | 1 + src/core/NEON/wrapper/intrinsics/erf.h | 1 + src/core/NEON/wrapper/intrinsics/exp.h | 1 + src/core/NEON/wrapper/intrinsics/getlane.h | 14 +- src/core/NEON/wrapper/intrinsics/inv.h | 1 + src/core/NEON/wrapper/intrinsics/invsqrt.h | 1 + src/core/NEON/wrapper/intrinsics/log.h | 1 + src/core/NEON/wrapper/intrinsics/pow.h | 1 + src/core/NEON/wrapper/intrinsics/qmov.h | 6 +- src/core/NEON/wrapper/intrinsics/reinterpret.h | 2 +- src/core/NEON/wrapper/intrinsics/round.h | 1 + src/core/NEON/wrapper/intrinsics/setlane.h | 12 +- src/core/NEON/wrapper/intrinsics/shr.h | 4 +- src/core/NEON/wrapper/intrinsics/sin.h | 3 +- src/core/NEON/wrapper/intrinsics/svcnt.h | 4 +- src/core/NEON/wrapper/intrinsics/svcvt.h | 35 +- src/core/NEON/wrapper/intrinsics/svexp.h | 3 +- src/core/NEON/wrapper/intrinsics/svlog.h | 3 +- src/core/NEON/wrapper/intrinsics/svptrue.h | 4 +- src/core/NEON/wrapper/intrinsics/svwhilelt.h | 4 +- src/core/NEON/wrapper/intrinsics/tanh.h | 1 + src/core/NEON/wrapper/scalar/add.h | 12 +- src/core/NEON/wrapper/scalar/sub.h | 12 +- src/core/NEON/wrapper/svtraits.h | 1 + src/core/Rounding.cpp | 7 
+- src/core/Size2D.cpp | 3 +- src/core/Size3D.cpp | 6 +- src/core/SubTensorInfo.cpp | 38 +- src/core/TensorInfo.cpp | 99 +- src/core/Utils.cpp | 260 +-- src/core/Validate.cpp | 115 +- src/core/common/Macros.h | 4 +- src/core/common/Registrars.h | 12 +- src/core/helpers/AutoConfiguration.h | 21 +- src/core/helpers/MemoryHelpers.h | 61 +- src/core/helpers/PoolingHelpers.h | 101 +- src/core/helpers/ScaleHelpers.h | 23 +- src/core/helpers/SoftmaxHelpers.cpp | 2 +- src/core/helpers/Utils.cpp | 4 +- src/core/helpers/Utils.h | 2 +- src/core/helpers/WindowHelpers.cpp | 163 +- src/core/helpers/WindowHelpers.h | 57 +- src/core/utils/ActivationFunctionUtils.cpp | 36 +- src/core/utils/AssemblyUtils.cpp | 14 +- src/core/utils/AssemblyUtils.h | 3 +- src/core/utils/DataLayoutUtils.cpp | 9 +- src/core/utils/DataTypeUtils.cpp | 54 +- src/core/utils/FormatUtils.cpp | 30 +- src/core/utils/InterpolationPolicyUtils.cpp | 9 +- src/core/utils/ScaleUtils.cpp | 15 +- src/core/utils/ScaleUtils.h | 7 +- src/core/utils/StringUtils.cpp | 16 +- src/core/utils/helpers/fft.cpp | 19 +- src/core/utils/helpers/float_ops.h | 3 +- src/core/utils/helpers/tensor_info.h | 14 +- src/core/utils/helpers/tensor_transform.cpp | 63 +- src/core/utils/io/FileHandler.cpp | 7 +- src/core/utils/logging/FilePrinter.cpp | 5 +- src/core/utils/logging/Helpers.cpp | 13 +- src/core/utils/logging/Logger.cpp | 17 +- src/core/utils/logging/LoggerRegistry.cpp | 18 +- src/core/utils/misc/MMappedFile.cpp | 26 +- src/core/utils/quantization/AsymmHelpers.cpp | 72 +- src/core/utils/quantization/AsymmHelpers.h | 7 +- src/cpu/CpuContext.cpp | 28 +- src/cpu/CpuContext.h | 12 +- src/cpu/CpuQueue.cpp | 3 +- src/cpu/CpuQueue.h | 4 +- src/cpu/CpuTensor.cpp | 5 +- src/cpu/CpuTensor.h | 8 +- src/cpu/CpuTypes.h | 2 +- src/cpu/ICpuKernel.h | 13 +- src/cpu/kernels/CpuActivationKernel.cpp | 246 +-- src/cpu/kernels/CpuActivationKernel.h | 10 +- src/cpu/kernels/CpuAddKernel.cpp | 233 +-- src/cpu/kernels/CpuAddKernel.h | 12 +- src/cpu/kernels/CpuAddMulAddKernel.cpp | 99 +- src/cpu/kernels/CpuAddMulAddKernel.h | 40 +- src/cpu/kernels/CpuCastKernel.cpp | 1346 +++++++------- src/cpu/kernels/CpuCastKernel.h | 7 +- src/cpu/kernels/CpuCol2ImKernel.cpp | 27 +- src/cpu/kernels/CpuCol2ImKernel.h | 3 +- src/cpu/kernels/CpuConcatenateBatchKernel.cpp | 126 +- src/cpu/kernels/CpuConcatenateBatchKernel.h | 6 +- src/cpu/kernels/CpuConcatenateDepthKernel.cpp | 126 +- src/cpu/kernels/CpuConcatenateDepthKernel.h | 6 +- src/cpu/kernels/CpuConcatenateHeightKernel.cpp | 120 +- src/cpu/kernels/CpuConcatenateHeightKernel.h | 4 +- src/cpu/kernels/CpuConcatenateWidthKernel.cpp | 118 +- src/cpu/kernels/CpuConcatenateWidthKernel.h | 4 +- .../CpuConvertFullyConnectedWeightsKernel.cpp | 32 +- .../CpuConvertFullyConnectedWeightsKernel.h | 20 +- .../CpuConvertQuantizedSignednessKernel.cpp | 55 +- .../kernels/CpuConvertQuantizedSignednessKernel.h | 2 +- src/cpu/kernels/CpuCopyKernel.cpp | 61 +- src/cpu/kernels/CpuCopyKernel.h | 2 +- src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 130 +- src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h | 24 +- src/cpu/kernels/CpuDequantizeKernel.cpp | 234 +-- src/cpu/kernels/CpuDequantizeKernel.h | 2 +- src/cpu/kernels/CpuDirectConv2dKernel.cpp | 62 +- src/cpu/kernels/CpuDirectConv2dKernel.h | 14 +- .../kernels/CpuDirectConv2dOutputStageKernel.cpp | 388 ++-- src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h | 34 +- src/cpu/kernels/CpuDirectConv3dKernel.cpp | 87 +- src/cpu/kernels/CpuDirectConv3dKernel.h | 20 +- src/cpu/kernels/CpuElementwiseKernel.cpp | 471 +++-- 
src/cpu/kernels/CpuElementwiseKernel.h | 13 +- src/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 90 +- src/cpu/kernels/CpuElementwiseUnaryKernel.h | 11 +- src/cpu/kernels/CpuFillKernel.cpp | 22 +- src/cpu/kernels/CpuFillKernel.h | 3 +- src/cpu/kernels/CpuFloorKernel.cpp | 39 +- src/cpu/kernels/CpuFloorKernel.h | 4 +- src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp | 56 +- src/cpu/kernels/CpuGemmInterleave4x4Kernel.h | 2 +- .../kernels/CpuGemmLowpMatrixMultiplyKernel.cpp | 1280 ++++++------- src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h | 4 +- .../kernels/CpuGemmLowpMatrixReductionKernel.cpp | 316 ++-- src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h | 29 +- .../CpuGemmLowpOffsetContributionKernel.cpp | 416 ++--- .../kernels/CpuGemmLowpOffsetContributionKernel.h | 23 +- ...GemmLowpOffsetContributionOutputStageKernel.cpp | 748 ++++---- ...puGemmLowpOffsetContributionOutputStageKernel.h | 28 +- .../CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp | 247 +-- .../CpuGemmLowpQuantizeDownInt32ScaleKernel.h | 19 +- ...tizeDownInt32ToInt16ScaleByFixedPointKernel.cpp | 153 +- ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h | 29 +- ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 183 +- ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 32 +- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 181 +- ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h | 32 +- src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp | 39 +- src/cpu/kernels/CpuGemmMatrixAdditionKernel.h | 6 +- src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp | 85 +- src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h | 23 +- src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp | 43 +- src/cpu/kernels/CpuGemmTranspose1xWKernel.h | 2 +- src/cpu/kernels/CpuIm2ColKernel.cpp | 288 +-- src/cpu/kernels/CpuIm2ColKernel.h | 35 +- src/cpu/kernels/CpuKernelSelectionTypes.h | 31 +- src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp | 75 +- src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h | 13 +- src/cpu/kernels/CpuMulKernel.cpp | 1772 +++++++++--------- src/cpu/kernels/CpuMulKernel.h | 40 +- src/cpu/kernels/CpuPermuteKernel.cpp | 155 +- src/cpu/kernels/CpuPermuteKernel.h | 2 +- src/cpu/kernels/CpuPool2dKernel.cpp | 329 ++-- src/cpu/kernels/CpuPool2dKernel.h | 20 +- src/cpu/kernels/CpuPool3dKernel.cpp | 75 +- src/cpu/kernels/CpuPool3dKernel.h | 10 +- src/cpu/kernels/CpuQuantizeKernel.cpp | 167 +- src/cpu/kernels/CpuQuantizeKernel.h | 8 +- src/cpu/kernels/CpuReshapeKernel.cpp | 65 +- src/cpu/kernels/CpuReshapeKernel.h | 7 +- src/cpu/kernels/CpuScaleKernel.cpp | 607 +++--- src/cpu/kernels/CpuScaleKernel.h | 69 +- src/cpu/kernels/CpuSoftmaxKernel.cpp | 193 +- src/cpu/kernels/CpuSoftmaxKernel.h | 23 +- src/cpu/kernels/CpuSubKernel.cpp | 109 +- src/cpu/kernels/CpuSubKernel.h | 10 +- src/cpu/kernels/CpuTransposeKernel.cpp | 772 ++++---- src/cpu/kernels/CpuTransposeKernel.h | 2 +- src/cpu/kernels/CpuWeightsReshapeKernel.cpp | 86 +- src/cpu/kernels/CpuWeightsReshapeKernel.h | 2 +- src/cpu/kernels/CpuWinogradConv2dKernel.cpp | 59 +- src/cpu/kernels/CpuWinogradConv2dKernel.h | 11 +- src/cpu/kernels/activation/generic/neon/fp16.cpp | 4 +- src/cpu/kernels/activation/generic/neon/fp32.cpp | 2 +- src/cpu/kernels/activation/generic/neon/impl.h | 274 +-- src/cpu/kernels/activation/generic/neon/lut.cpp | 20 +- .../kernels/activation/generic/neon/qasymm8.cpp | 344 ++-- .../activation/generic/neon/qasymm8_signed.cpp | 300 +-- .../kernels/activation/generic/neon/qsymm16.cpp | 163 +- src/cpu/kernels/activation/generic/sve/fp16.cpp | 148 +- src/cpu/kernels/activation/generic/sve/fp32.cpp | 149 
+- src/cpu/kernels/activation/generic/sve2/lut.cpp | 20 +- .../kernels/activation/generic/sve2/qasymm8.cpp | 264 +-- .../activation/generic/sve2/qasymm8_signed.cpp | 306 +-- .../kernels/activation/generic/sve2/qsymm16.cpp | 121 +- src/cpu/kernels/add/generic/neon/fp16.cpp | 5 +- src/cpu/kernels/add/generic/neon/fp32.cpp | 5 +- src/cpu/kernels/add/generic/neon/impl.cpp | 711 +++---- src/cpu/kernels/add/generic/neon/impl.h | 145 +- src/cpu/kernels/add/generic/neon/integer.cpp | 11 +- src/cpu/kernels/add/generic/neon/qasymm8.cpp | 6 +- .../kernels/add/generic/neon/qasymm8_signed.cpp | 6 +- src/cpu/kernels/add/generic/neon/qsymm16.cpp | 162 +- src/cpu/kernels/add/generic/sve/fp16.cpp | 5 +- src/cpu/kernels/add/generic/sve/fp32.cpp | 6 +- src/cpu/kernels/add/generic/sve/impl.cpp | 106 +- src/cpu/kernels/add/generic/sve/impl.h | 3 +- src/cpu/kernels/add/generic/sve/integer.cpp | 12 +- src/cpu/kernels/add/generic/sve2/qasymm8.cpp | 237 ++- .../kernels/add/generic/sve2/qasymm8_signed.cpp | 193 +- src/cpu/kernels/add/generic/sve2/qsymm16.cpp | 119 +- src/cpu/kernels/add/list.h | 7 +- src/cpu/kernels/addmuladd/generic/neon/fp16.cpp | 106 +- src/cpu/kernels/addmuladd/generic/neon/fp32.cpp | 104 +- src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp | 137 +- .../addmuladd/generic/neon/qasymm8_signed.cpp | 137 +- src/cpu/kernels/addmuladd/list.h | 5 +- .../assembly/CpuGemmAssemblyWrapperKernel.h | 12 +- src/cpu/kernels/assembly/arm_gemm.hpp | 91 +- .../kernels/assembly/arm_gemm_compute_iface.hpp | 38 +- src/cpu/kernels/assembly/gemm_common.hpp | 74 +- src/cpu/kernels/assembly/ndrange.hpp | 19 +- .../boundingboxtransform/generic/neon/fp16.cpp | 6 +- .../boundingboxtransform/generic/neon/fp32.cpp | 6 +- .../boundingboxtransform/generic/neon/impl.cpp | 85 +- .../boundingboxtransform/generic/neon/impl.h | 85 +- .../boundingboxtransform/generic/neon/qsymm16.cpp | 6 +- src/cpu/kernels/boundingboxtransform/list.h | 5 +- src/cpu/kernels/cast/generic/neon/fp16.cpp | 393 ++-- src/cpu/kernels/cast/list.h | 7 +- src/cpu/kernels/conv3d/neon/list.h | 165 +- src/cpu/kernels/conv3d/neon/quantized.h | 257 +-- src/cpu/kernels/crop/generic/neon/crop_helper.h | 4 +- src/cpu/kernels/crop/generic/neon/fp16.cpp | 17 +- src/cpu/kernels/crop/generic/neon/fp32.cpp | 17 +- src/cpu/kernels/crop/generic/neon/impl.h | 49 +- src/cpu/kernels/crop/generic/neon/integer.cpp | 92 +- src/cpu/kernels/crop/list.h | 6 +- .../kernels/depthwiseconv2d/generic/neon/fp16.cpp | 11 +- .../kernels/depthwiseconv2d/generic/neon/fp32.cpp | 11 +- .../kernels/depthwiseconv2d/generic/neon/impl.cpp | 592 +++--- .../kernels/depthwiseconv2d/generic/neon/impl.h | 279 +-- .../depthwiseconv2d/generic/neon/qasymm8.cpp | 20 +- .../generic/neon/qasymm8_signed.cpp | 20 +- src/cpu/kernels/depthwiseconv2d/list.h | 6 +- src/cpu/kernels/directconv2d/list.h | 5 +- src/cpu/kernels/directconv2d/nchw/all.cpp | 142 +- src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp | 5 +- src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp | 252 +-- src/cpu/kernels/directconv2d/nhwc/neon/impl.h | 4 +- .../elementwise_binary/generic/neon/fp16.cpp | 73 +- .../elementwise_binary/generic/neon/fp32.cpp | 73 +- .../kernels/elementwise_binary/generic/neon/impl.h | 852 +++++---- .../elementwise_binary/generic/neon/integer.cpp | 173 +- .../elementwise_binary/generic/neon/qasymm8.cpp | 76 +- .../generic/neon/qasymm8_signed.cpp | 74 +- .../elementwise_binary/generic/sve/fp16.cpp | 73 +- .../elementwise_binary/generic/sve/fp32.cpp | 71 +- .../elementwise_binary/generic/sve/impl.cpp | 250 +-- 
.../kernels/elementwise_binary/generic/sve/impl.h | 16 +- .../elementwise_binary/generic/sve/integer.cpp | 171 +- .../kernels/elementwise_binary/generic/sve2/impl.h | 379 ++-- .../elementwise_binary/generic/sve2/qasymm8.cpp | 76 +- .../generic/sve2/qasymm8_signed.cpp | 74 +- .../elementwise_unary/generic/neon/fp16.cpp | 6 +- .../elementwise_unary/generic/neon/fp32.cpp | 6 +- .../kernels/elementwise_unary/generic/neon/impl.h | 244 +-- .../elementwise_unary/generic/neon/integer.cpp | 6 +- .../kernels/elementwise_unary/generic/neon/q8.cpp | 21 +- .../elementwise_unary/generic/neon/qasymm8.cpp | 4 +- .../generic/neon/qasymm8_signed.cpp | 4 +- .../kernels/elementwise_unary/generic/sve/fp16.cpp | 6 +- .../kernels/elementwise_unary/generic/sve/fp32.cpp | 6 +- .../kernels/elementwise_unary/generic/sve/impl.cpp | 44 +- .../elementwise_unary/generic/sve/integer.cpp | 6 +- .../kernels/elementwise_unary/generic/sve2/q8.cpp | 20 +- src/cpu/kernels/floor/list.h | 3 +- src/cpu/kernels/floor/neon/fp16.cpp | 4 +- src/cpu/kernels/floor/neon/fp32.cpp | 4 +- .../fuse_batch_normalization/generic/fp16.cpp | 16 +- .../fuse_batch_normalization/generic/fp32.cpp | 16 +- .../fuse_batch_normalization/generic/impl.h | 120 +- src/cpu/kernels/fuse_batch_normalization/list.h | 15 +- .../kernels/fuse_batch_normalization/nchw/all.cpp | 147 +- .../fuse_batch_normalization/nhwc/neon/fp16.cpp | 16 +- .../fuse_batch_normalization/nhwc/neon/fp32.cpp | 16 +- .../fuse_batch_normalization/nhwc/neon/impl.h | 143 +- .../kernels/gemm_matrix_add/generic/neon/fp16.cpp | 44 +- .../kernels/gemm_matrix_add/generic/neon/impl.cpp | 49 +- .../kernels/gemm_matrix_mul/generic/neon/fp16.cpp | 472 ++--- .../kernels/gemm_matrix_mul/generic/neon/fp32.cpp | 15 +- .../kernels/gemm_matrix_mul/generic/neon/impl.cpp | 898 ++++----- .../kernels/gemm_matrix_mul/generic/neon/impl.h | 7 +- src/cpu/kernels/gemm_matrix_mul/list.h | 5 +- src/cpu/kernels/genproposals/generic/neon/fp16.cpp | 7 +- src/cpu/kernels/genproposals/generic/neon/fp32.cpp | 7 +- src/cpu/kernels/genproposals/generic/neon/impl.cpp | 43 +- src/cpu/kernels/genproposals/generic/neon/impl.h | 41 +- .../kernels/genproposals/generic/neon/qsymm16.cpp | 7 +- src/cpu/kernels/instancenorm/generic/neon/fp16.cpp | 201 +- src/cpu/kernels/instancenorm/generic/neon/fp32.cpp | 8 +- src/cpu/kernels/instancenorm/generic/neon/impl.cpp | 173 +- src/cpu/kernels/instancenorm/generic/neon/impl.h | 6 +- src/cpu/kernels/instancenorm/list.h | 5 +- .../CpuDepthwiseConv2dAssemblyWrapperKernel.cpp | 165 +- .../CpuDepthwiseConv2dAssemblyWrapperKernel.h | 23 +- .../internal/CpuPool2dAssemblyWrapperKernel.cpp | 99 +- .../internal/CpuPool2dAssemblyWrapperKernel.h | 13 +- src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp | 6 +- src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp | 10 +- src/cpu/kernels/l2normlayer/generic/neon/impl.h | 96 +- src/cpu/kernels/l2normlayer/list.h | 5 +- src/cpu/kernels/lut/generic/neon/u8.cpp | 736 ++++---- src/cpu/kernels/lut/generic/sve2/u8.cpp | 10 +- src/cpu/kernels/lut/list.h | 10 +- src/cpu/kernels/maxunpool/generic/neon/impl.h | 17 +- .../kernels/meanstddevnorm/generic/neon/fp16.cpp | 102 +- .../kernels/meanstddevnorm/generic/neon/impl.cpp | 97 +- .../meanstddevnorm/generic/neon/qasymm8.cpp | 124 +- src/cpu/kernels/pool2d/neon/fp16.cpp | 442 +++-- src/cpu/kernels/pool2d/neon/fp32.cpp | 562 +++--- src/cpu/kernels/pool2d/neon/list.h | 38 +- src/cpu/kernels/pool2d/neon/nchw/all.cpp | 1097 ++++++----- src/cpu/kernels/pool2d/neon/qasymm8.cpp | 12 +- 
src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp | 12 +- src/cpu/kernels/pool2d/neon/quantized.h | 961 +++++----- src/cpu/kernels/pool3d/neon/impl.h | 417 +++-- src/cpu/kernels/pool3d/neon/quantized.h | 403 ++-- src/cpu/kernels/range/generic/neon/fp16.cpp | 4 +- src/cpu/kernels/range/generic/neon/fp32.cpp | 4 +- src/cpu/kernels/range/generic/neon/impl.h | 48 +- src/cpu/kernels/range/list.h | 3 +- src/cpu/kernels/roialign/generic/neon/fp16.cpp | 7 +- src/cpu/kernels/roialign/generic/neon/fp32.cpp | 7 +- src/cpu/kernels/roialign/generic/neon/impl.h | 192 +- src/cpu/kernels/roialign/generic/neon/qasymm8.cpp | 7 +- .../roialign/generic/neon/qasymm8_signed.cpp | 7 +- src/cpu/kernels/roialign/list.h | 6 +- src/cpu/kernels/scale/neon/fp16.cpp | 189 +- src/cpu/kernels/scale/neon/integer.cpp | 416 +++-- src/cpu/kernels/scale/neon/list.h | 163 +- src/cpu/kernels/scale/neon/qasymm8.cpp | 217 ++- src/cpu/kernels/scale/neon/qasymm8_signed.cpp | 206 ++- src/cpu/kernels/scale/sve/fp16.cpp | 75 +- src/cpu/kernels/scale/sve/fp32.cpp | 76 +- src/cpu/kernels/scale/sve/integer.cpp | 145 +- src/cpu/kernels/scale/sve/list.h | 8 +- src/cpu/kernels/scale/sve/qasymm8.cpp | 74 +- src/cpu/kernels/scale/sve/qasymm8_signed.cpp | 74 +- src/cpu/kernels/select/generic/neon/fp16.cpp | 12 +- src/cpu/kernels/select/generic/neon/fp32.cpp | 10 +- src/cpu/kernels/select/generic/neon/impl.h | 111 +- src/cpu/kernels/select/generic/neon/integer.cpp | 40 +- src/cpu/kernels/softmax/generic/neon/fp16.cpp | 12 +- src/cpu/kernels/softmax/generic/neon/fp32.cpp | 12 +- src/cpu/kernels/softmax/generic/neon/impl.cpp | 281 +-- src/cpu/kernels/softmax/generic/neon/impl.h | 248 +-- src/cpu/kernels/softmax/generic/neon/qasymm8.cpp | 12 +- .../softmax/generic/neon/qasymm8_signed.cpp | 12 +- src/cpu/kernels/softmax/generic/sve/fp16.cpp | 12 +- src/cpu/kernels/softmax/generic/sve/fp32.cpp | 12 +- src/cpu/kernels/softmax/generic/sve/impl.cpp | 211 ++- src/cpu/kernels/softmax/generic/sve/impl.h | 9 +- src/cpu/kernels/softmax/generic/sve/qasymm8.cpp | 3 +- .../kernels/softmax/generic/sve/qasymm8_signed.cpp | 3 +- src/cpu/kernels/softmax/generic/sve2/impl.cpp | 289 +-- src/cpu/kernels/softmax/generic/sve2/impl.h | 9 +- src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp | 12 +- .../softmax/generic/sve2/qasymm8_signed.cpp | 12 +- src/cpu/kernels/softmax/list.h | 9 +- src/cpu/kernels/sub/neon/list.h | 119 +- src/cpu/kernels/sub/neon/qasymm8.cpp | 9 +- src/cpu/kernels/sub/neon/qasymm8_signed.cpp | 9 +- src/cpu/kernels/sub/neon/qsymm16.cpp | 166 +- src/cpu/operators/CpuActivation.cpp | 14 +- src/cpu/operators/CpuActivation.h | 1 + src/cpu/operators/CpuAdd.cpp | 17 +- src/cpu/operators/CpuAdd.h | 13 +- src/cpu/operators/CpuAddMulAdd.cpp | 85 +- src/cpu/operators/CpuAddMulAdd.h | 26 +- src/cpu/operators/CpuCast.cpp | 3 +- src/cpu/operators/CpuConcatenate.cpp | 34 +- src/cpu/operators/CpuConcatenate.h | 4 +- src/cpu/operators/CpuConv2d.cpp | 140 +- src/cpu/operators/CpuConv2d.h | 40 +- .../operators/CpuConvertFullyConnectedWeights.cpp | 13 +- .../operators/CpuConvertFullyConnectedWeights.h | 8 +- src/cpu/operators/CpuCopy.cpp | 3 +- src/cpu/operators/CpuDepthwiseConv2d.cpp | 157 +- src/cpu/operators/CpuDepthwiseConv2d.h | 86 +- .../CpuDepthwiseConv2dAssemblyDispatch.cpp | 29 +- .../operators/CpuDepthwiseConv2dAssemblyDispatch.h | 17 +- src/cpu/operators/CpuDequantize.cpp | 1 + src/cpu/operators/CpuDirectConv2d.cpp | 50 +- src/cpu/operators/CpuDirectConv2d.h | 24 +- src/cpu/operators/CpuDirectConv3d.cpp | 27 +- src/cpu/operators/CpuDirectConv3d.h | 16 +- 
src/cpu/operators/CpuElementwise.cpp | 18 +- src/cpu/operators/CpuElementwise.h | 5 +- src/cpu/operators/CpuElementwiseUnary.cpp | 5 +- src/cpu/operators/CpuElementwiseUnary.h | 3 +- src/cpu/operators/CpuFill.cpp | 3 +- src/cpu/operators/CpuFill.h | 1 + src/cpu/operators/CpuFlatten.cpp | 6 +- src/cpu/operators/CpuFloor.cpp | 3 +- src/cpu/operators/CpuFullyConnected.cpp | 225 ++- src/cpu/operators/CpuFullyConnected.h | 52 +- src/cpu/operators/CpuGemm.cpp | 198 +- src/cpu/operators/CpuGemm.h | 66 +- src/cpu/operators/CpuGemmConv2d.cpp | 378 ++-- src/cpu/operators/CpuGemmConv2d.h | 78 +- src/cpu/operators/CpuGemmDirectConv2d.cpp | 85 +- src/cpu/operators/CpuGemmDirectConv2d.h | 17 +- .../operators/CpuGemmLowpMatrixMultiplyCore.cpp | 365 ++-- src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h | 17 +- src/cpu/operators/CpuGemmLowpOutputStage.cpp | 52 +- src/cpu/operators/CpuGemmLowpOutputStage.h | 6 +- src/cpu/operators/CpuMatMul.cpp | 113 +- src/cpu/operators/CpuMatMul.h | 34 +- src/cpu/operators/CpuMaxUnpooling.cpp | 13 +- src/cpu/operators/CpuMaxUnpooling.h | 8 +- src/cpu/operators/CpuMul.cpp | 27 +- src/cpu/operators/CpuMul.h | 25 +- src/cpu/operators/CpuPermute.cpp | 5 +- src/cpu/operators/CpuPool2d.cpp | 35 +- src/cpu/operators/CpuPool2d.h | 11 +- src/cpu/operators/CpuPool3d.cpp | 6 +- src/cpu/operators/CpuPool3d.h | 3 +- src/cpu/operators/CpuQuantize.cpp | 1 + src/cpu/operators/CpuReshape.cpp | 7 +- src/cpu/operators/CpuReshape.h | 5 +- src/cpu/operators/CpuScale.cpp | 128 +- src/cpu/operators/CpuScale.h | 9 +- src/cpu/operators/CpuSoftmax.cpp | 101 +- src/cpu/operators/CpuSoftmax.h | 6 +- src/cpu/operators/CpuSub.cpp | 17 +- src/cpu/operators/CpuSub.h | 13 +- src/cpu/operators/CpuTranspose.cpp | 5 +- src/cpu/operators/CpuWinogradConv2d.cpp | 263 +-- src/cpu/operators/CpuWinogradConv2d.h | 62 +- .../operators/internal/CpuGemmAssemblyDispatch.cpp | 393 ++-- .../operators/internal/CpuGemmAssemblyDispatch.h | 55 +- src/cpu/utils/CpuAuxTensorHandler.h | 26 +- .../runtime/gpu/cl/ClKernelRuntime.cpp | 61 +- .../runtime/gpu/cl/ClKernelRuntime.h | 11 +- .../runtime/gpu/cl/ClWorkloadRuntime.cpp | 81 +- .../cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp | 7 +- .../cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h | 5 +- src/dynamic_fusion/sketch/ArgumentPack.h | 45 +- .../attributes/DepthwiseConv2dAttributes.cpp | 3 +- .../sketch/attributes/Pool2dAttributes.cpp | 1 + src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h | 15 +- .../sketch/gpu/GpuKernelComponentGraph.cpp | 20 +- .../sketch/gpu/GpuKernelComponentGraph.h | 18 +- .../sketch/gpu/GpuKernelComponentGroup.cpp | 105 +- .../sketch/gpu/GpuKernelComponentGroup.h | 23 +- .../sketch/gpu/GpuKernelComponentStream.cpp | 10 +- .../sketch/gpu/GpuKernelComponentStream.h | 5 +- .../sketch/gpu/GpuKernelSourceCode.h | 1 + src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp | 10 +- src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp | 48 +- src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h | 4 +- .../sketch/gpu/GpuWorkloadContext.cpp | 16 +- .../sketch/gpu/GpuWorkloadContextImpl.h | 6 +- .../sketch/gpu/GpuWorkloadSketch.cpp | 4 +- .../sketch/gpu/GpuWorkloadSketchImpl.h | 11 +- .../sketch/gpu/GpuWorkloadSourceCode.h | 56 +- src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h | 1 + .../gpu/ckw_driver/GpuCkwComponentArgument.cpp | 7 +- .../gpu/ckw_driver/GpuCkwComponentArgument.h | 6 +- .../sketch/gpu/ckw_driver/GpuCkwDriver.cpp | 21 +- .../sketch/gpu/ckw_driver/GpuCkwDriver.h | 4 +- .../sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp | 12 +- 
.../gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp | 1 + .../gpu/ckw_driver/GpuCkwScopedKernelWriter.h | 2 +- .../sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp | 20 +- .../sketch/gpu/ckw_driver/GpuCkwVariableTable.h | 8 +- .../sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h | 10 +- .../gpu/ckw_driver/components/GpuCkwActivation.cpp | 34 +- .../gpu/ckw_driver/components/GpuCkwActivation.h | 10 +- .../gpu/ckw_driver/components/GpuCkwCast.cpp | 44 +- .../sketch/gpu/ckw_driver/components/GpuCkwCast.h | 10 +- .../ckw_driver/components/GpuCkwDirectConv2d.cpp | 49 +- .../components/GpuCkwElementwiseBinary.cpp | 84 +- .../components/GpuCkwElementwiseBinary.h | 14 +- .../gpu/ckw_driver/components/GpuCkwPool2d.cpp | 171 +- .../gpu/ckw_driver/components/GpuCkwPool2d.h | 8 +- .../gpu/ckw_driver/components/GpuCkwResize.cpp | 76 +- .../gpu/ckw_driver/components/GpuCkwStore.cpp | 10 +- .../sketch/gpu/ckw_driver/components/GpuCkwStore.h | 6 +- .../gpu/ckw_driver/components/utils/WriterHelper.h | 31 +- .../components/utils/type_converter/Common.h | 35 +- .../utils/type_converter/ElementwiseBinary.h | 3 +- .../gpu/components/GpuKernelComponentFactory.h | 7 +- .../sketch/gpu/components/IGpuKernelComponent.h | 15 +- .../gpu/components/cl/ClComponentActivation.cpp | 12 +- .../gpu/components/cl/ClComponentActivation.h | 18 +- .../sketch/gpu/components/cl/ClComponentCast.cpp | 30 +- .../sketch/gpu/components/cl/ClComponentCast.h | 10 +- .../components/cl/ClComponentDepthwiseConv2d.cpp | 57 +- .../gpu/components/cl/ClComponentDepthwiseConv2d.h | 34 +- .../gpu/components/cl/ClComponentDirectConv2d.cpp | 64 +- .../gpu/components/cl/ClComponentDirectConv2d.h | 26 +- .../components/cl/ClComponentElementwiseBinary.cpp | 65 +- .../components/cl/ClComponentElementwiseBinary.h | 12 +- .../cl/ClComponentLogits1DMaxShiftExpSum.cpp | 14 +- .../cl/ClComponentLogits1DMaxShiftExpSum.h | 7 +- .../gpu/components/cl/ClComponentLogits1DNorm.cpp | 14 +- .../gpu/components/cl/ClComponentLogits1DNorm.h | 7 +- .../sketch/gpu/components/cl/ClComponentPool2d.cpp | 49 +- .../sketch/gpu/components/cl/ClComponentPool2d.h | 21 +- .../gpu/components/cl/ClComponentReshape.cpp | 12 +- .../sketch/gpu/components/cl/ClComponentReshape.h | 5 +- .../sketch/gpu/components/cl/ClComponentResize.cpp | 14 +- .../sketch/gpu/components/cl/ClComponentResize.h | 13 +- .../sketch/gpu/components/cl/ClComponentStore.cpp | 20 +- .../sketch/gpu/components/cl/ClComponentStore.h | 5 +- .../utils/type_printer/ElementwiseBinary.h | 22 +- src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp | 19 +- .../sketch/gpu/operators/GpuCast.cpp | 52 +- .../sketch/gpu/operators/GpuClamp.cpp | 44 +- .../sketch/gpu/operators/GpuConv2d.cpp | 65 +- .../sketch/gpu/operators/GpuDepthwiseConv2d.cpp | 90 +- src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp | 13 +- .../sketch/gpu/operators/GpuOutput.cpp | 19 +- .../sketch/gpu/operators/GpuPool2d.cpp | 43 +- .../sketch/gpu/operators/GpuReshape.cpp | 27 +- .../sketch/gpu/operators/GpuResize.cpp | 40 +- .../sketch/gpu/operators/GpuSigmoid.cpp | 31 +- .../sketch/gpu/operators/GpuSoftmax.cpp | 38 +- src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp | 19 +- .../sketch/gpu/operators/GpuTanh.cpp | 35 +- .../internal/GpuElementwiseBinaryCommon.cpp | 19 +- .../gpu/template_writer/GpuKernelVariableTable.cpp | 36 +- .../gpu/template_writer/GpuKernelVariableTable.h | 17 +- .../template_writer/IGpuTemplateComponentWriter.h | 8 +- .../template_writer/cl/ClTemplateActivation.cpp | 26 +- .../gpu/template_writer/cl/ClTemplateActivation.h | 1 + 
.../gpu/template_writer/cl/ClTemplateCast.cpp | 30 +- .../cl/ClTemplateDepthwiseConv2d.cpp | 81 +- .../template_writer/cl/ClTemplateDepthwiseConv2d.h | 1 + .../template_writer/cl/ClTemplateDirectConv2d.cpp | 112 +- .../template_writer/cl/ClTemplateDirectConv2d.h | 1 + .../cl/ClTemplateElementwiseBinary.cpp | 94 +- .../cl/ClTemplateElementwiseBinary.h | 5 +- .../cl/ClTemplateLogits1DMaxShiftExpSum.cpp | 57 +- .../cl/ClTemplateLogits1DMaxShiftExpSum.h | 4 +- .../template_writer/cl/ClTemplateLogits1DNorm.cpp | 35 +- .../gpu/template_writer/cl/ClTemplatePool2d.cpp | 92 +- .../gpu/template_writer/cl/ClTemplatePool2d.h | 1 + .../gpu/template_writer/cl/ClTemplateReshape.cpp | 28 +- .../gpu/template_writer/cl/ClTemplateReshape.h | 4 +- .../gpu/template_writer/cl/ClTemplateResize.cpp | 56 +- .../gpu/template_writer/cl/ClTemplateStore.cpp | 16 +- .../gpu/template_writer/cl/ClTemplateStore.h | 1 + .../gpu/template_writer/cl/ClTemplateWriter.cpp | 59 +- src/dynamic_fusion/sketch/utils/DependencyGraph.h | 182 +- src/dynamic_fusion/utils/Utils.h | 16 +- src/gpu/cl/ClContext.cpp | 17 +- src/gpu/cl/ClContext.h | 12 +- src/gpu/cl/ClKernelLibrary.cpp | 679 ++++--- src/gpu/cl/ClKernelLibrary.h | 14 +- src/gpu/cl/ClQueue.cpp | 7 +- src/gpu/cl/ClQueue.h | 4 +- src/gpu/cl/ClTensor.cpp | 7 +- src/gpu/cl/ClTensor.h | 8 +- src/gpu/cl/IClKernel.h | 1 + src/gpu/cl/kernels/ClActivationKernel.cpp | 115 +- src/gpu/cl/kernels/ClActivationKernel.h | 8 +- src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp | 24 +- src/gpu/cl/kernels/ClBatchConcatenateKernel.h | 5 +- src/gpu/cl/kernels/ClCastKernel.cpp | 42 +- src/gpu/cl/kernels/ClCastKernel.h | 3 +- src/gpu/cl/kernels/ClCol2ImKernel.cpp | 47 +- src/gpu/cl/kernels/ClCol2ImKernel.h | 10 +- .../ClConvertFullyConnectedWeightsKernel.cpp | 26 +- .../kernels/ClConvertFullyConnectedWeightsKernel.h | 11 +- src/gpu/cl/kernels/ClCopyKernel.cpp | 36 +- src/gpu/cl/kernels/ClCopyKernel.h | 5 +- src/gpu/cl/kernels/ClCropKernel.cpp | 45 +- src/gpu/cl/kernels/ClCropKernel.h | 19 +- src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp | 28 +- src/gpu/cl/kernels/ClDepthConcatenateKernel.h | 3 +- src/gpu/cl/kernels/ClDequantizeKernel.cpp | 27 +- src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 192 +- src/gpu/cl/kernels/ClDirectConv2dKernel.h | 26 +- src/gpu/cl/kernels/ClDirectConv3dKernel.cpp | 72 +- src/gpu/cl/kernels/ClDirectConv3dKernel.h | 13 +- src/gpu/cl/kernels/ClElementwiseKernel.cpp | 246 ++- src/gpu/cl/kernels/ClElementwiseKernel.h | 81 +- src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp | 44 +- src/gpu/cl/kernels/ClElementwiseUnaryKernel.h | 5 +- src/gpu/cl/kernels/ClFillKernel.cpp | 27 +- src/gpu/cl/kernels/ClFillKernel.h | 5 +- src/gpu/cl/kernels/ClFloorKernel.cpp | 15 +- .../ClGemmLowpMatrixMultiplyNativeKernel.cpp | 124 +- .../kernels/ClGemmLowpMatrixMultiplyNativeKernel.h | 26 +- .../ClGemmLowpMatrixMultiplyReshapedKernel.cpp | 108 +- .../ClGemmLowpMatrixMultiplyReshapedKernel.h | 26 +- ...GemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp | 245 ++- ...ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h | 40 +- ...LowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp | 214 ++- ...mmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h | 36 +- .../kernels/ClGemmLowpOffsetContributionKernel.cpp | 85 +- .../kernels/ClGemmLowpOffsetContributionKernel.h | 16 +- ...GemmLowpOffsetContributionOutputStageKernel.cpp | 123 +- ...ClGemmLowpOffsetContributionOutputStageKernel.h | 29 +- ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 61 +- ...mLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 11 +- 
...GemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 56 +- ...ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h | 11 +- .../ClGemmLowpQuantizeDownInt32ScaleKernel.cpp | 64 +- .../ClGemmLowpQuantizeDownInt32ScaleKernel.h | 13 +- src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp | 60 +- src/gpu/cl/kernels/ClGemmLowpReductionKernel.h | 22 +- .../kernels/ClGemmMatrixMultiplyNativeKernel.cpp | 190 +- .../cl/kernels/ClGemmMatrixMultiplyNativeKernel.h | 33 +- .../kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 168 +- .../kernels/ClGemmMatrixMultiplyReshapedKernel.h | 41 +- .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp | 166 +- .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h | 48 +- ...GemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp | 141 +- ...ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h | 27 +- .../cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp | 48 +- src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h | 13 +- .../cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp | 53 +- src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h | 7 +- src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp | 24 +- src/gpu/cl/kernels/ClHeightConcatenateKernel.h | 5 +- src/gpu/cl/kernels/ClIm2ColKernel.cpp | 154 +- src/gpu/cl/kernels/ClIm2ColKernel.h | 31 +- ...ClIndirectConv2dAddressPrecalculationKernel.cpp | 52 +- .../ClIndirectConv2dAddressPrecalculationKernel.h | 15 +- src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp | 132 +- src/gpu/cl/kernels/ClIndirectConv2dKernel.h | 24 +- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp | 63 +- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h | 14 +- .../cl/kernels/ClMatMulLowpNativeMMULKernel.cpp | 51 +- src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h | 14 +- src/gpu/cl/kernels/ClMatMulNativeKernel.cpp | 80 +- src/gpu/cl/kernels/ClMatMulNativeKernel.h | 16 +- src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp | 58 +- src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h | 19 +- src/gpu/cl/kernels/ClMulKernel.cpp | 185 +- src/gpu/cl/kernels/ClMulKernel.h | 31 +- src/gpu/cl/kernels/ClPermuteKernel.cpp | 27 +- src/gpu/cl/kernels/ClPermuteKernel.h | 5 +- src/gpu/cl/kernels/ClPool2dKernel.cpp | 184 +- src/gpu/cl/kernels/ClPool2dKernel.h | 15 +- src/gpu/cl/kernels/ClPool3dKernel.cpp | 103 +- src/gpu/cl/kernels/ClPool3dKernel.h | 9 +- src/gpu/cl/kernels/ClQuantizeKernel.cpp | 27 +- src/gpu/cl/kernels/ClReshapeKernel.cpp | 30 +- src/gpu/cl/kernels/ClReshapeKernel.h | 2 +- src/gpu/cl/kernels/ClScaleKernel.cpp | 68 +- src/gpu/cl/kernels/ClScaleKernel.h | 6 +- src/gpu/cl/kernels/ClSoftmaxKernel.cpp | 114 +- src/gpu/cl/kernels/ClSoftmaxKernel.h | 20 +- src/gpu/cl/kernels/ClTransposeKernel.cpp | 24 +- .../cl/kernels/ClTransposedConvolutionKernel.cpp | 64 +- src/gpu/cl/kernels/ClTransposedConvolutionKernel.h | 17 +- src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp | 44 +- src/gpu/cl/kernels/ClWeightsReshapeKernel.h | 11 +- .../kernels/ClWidthConcatenate2TensorsKernel.cpp | 30 +- .../cl/kernels/ClWidthConcatenate2TensorsKernel.h | 4 +- .../kernels/ClWidthConcatenate4TensorsKernel.cpp | 65 +- .../cl/kernels/ClWidthConcatenate4TensorsKernel.h | 21 +- src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp | 21 +- src/gpu/cl/kernels/ClWidthConcatenateKernel.h | 5 +- .../cl/kernels/ClWinogradFilterTransformKernel.cpp | 46 +- .../cl/kernels/ClWinogradFilterTransformKernel.h | 8 +- .../cl/kernels/ClWinogradInputTransformKernel.cpp | 63 +- .../cl/kernels/ClWinogradInputTransformKernel.h | 22 +- .../cl/kernels/ClWinogradOutputTransformKernel.cpp | 110 +- .../cl/kernels/ClWinogradOutputTransformKernel.h | 23 +- 
src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp | 79 +- src/gpu/cl/kernels/gemm/ClGemmHelpers.h | 36 +- src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h | 12 +- .../native/ClGemmDefaultConfigNativeBifrost.cpp | 90 +- .../gemm/native/ClGemmDefaultConfigNativeBifrost.h | 21 +- .../native/ClGemmDefaultConfigNativeMidgard.cpp | 19 +- .../gemm/native/ClGemmDefaultConfigNativeMidgard.h | 6 +- .../native/ClGemmDefaultConfigNativeValhall.cpp | 54 +- .../gemm/native/ClGemmDefaultConfigNativeValhall.h | 12 +- .../kernels/gemm/native/ClGemmNativeKernelConfig.h | 2 +- .../ClGemmDefaultConfigReshapedBifrost.cpp | 163 +- .../reshaped/ClGemmDefaultConfigReshapedBifrost.h | 27 +- .../ClGemmDefaultConfigReshapedValhall.cpp | 168 +- .../reshaped/ClGemmDefaultConfigReshapedValhall.h | 18 +- .../gemm/reshaped/ClGemmReshapedKernelConfig.h | 2 +- .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp | 242 +-- .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.h | 39 +- .../ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp | 550 +++--- .../ClGemmDefaultConfigReshapedRhsOnlyValhall.h | 27 +- .../ClGemmReshapedOnlyRhsKernelConfig.h | 2 +- src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp | 15 +- src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h | 12 +- src/gpu/cl/operators/ClActivation.cpp | 20 +- src/gpu/cl/operators/ClActivation.h | 6 +- src/gpu/cl/operators/ClAdd.cpp | 18 +- src/gpu/cl/operators/ClAdd.h | 12 +- src/gpu/cl/operators/ClCast.cpp | 8 +- src/gpu/cl/operators/ClCast.h | 3 +- src/gpu/cl/operators/ClConcatenate.cpp | 73 +- src/gpu/cl/operators/ClConcatenate.h | 9 +- src/gpu/cl/operators/ClConv2d.cpp | 195 +- src/gpu/cl/operators/ClConv2d.h | 28 +- .../operators/ClConvertFullyConnectedWeights.cpp | 16 +- .../cl/operators/ClConvertFullyConnectedWeights.h | 11 +- src/gpu/cl/operators/ClCopy.cpp | 5 +- src/gpu/cl/operators/ClCopy.h | 6 +- src/gpu/cl/operators/ClCrop.cpp | 23 +- src/gpu/cl/operators/ClCrop.h | 20 +- src/gpu/cl/operators/ClDequantize.cpp | 4 +- src/gpu/cl/operators/ClDirectConv2d.cpp | 41 +- src/gpu/cl/operators/ClDirectConv2d.h | 20 +- src/gpu/cl/operators/ClDirectConv3d.cpp | 14 +- src/gpu/cl/operators/ClDirectConv3d.h | 17 +- src/gpu/cl/operators/ClElementwiseOperations.cpp | 58 +- src/gpu/cl/operators/ClElementwiseOperations.h | 56 +- src/gpu/cl/operators/ClElementwiseUnary.cpp | 3 +- src/gpu/cl/operators/ClFill.cpp | 10 +- src/gpu/cl/operators/ClFill.h | 6 +- src/gpu/cl/operators/ClFlatten.cpp | 3 +- src/gpu/cl/operators/ClFloor.cpp | 3 +- src/gpu/cl/operators/ClFullyConnected.cpp | 282 +-- src/gpu/cl/operators/ClFullyConnected.h | 56 +- src/gpu/cl/operators/ClGemm.cpp | 472 +++-- src/gpu/cl/operators/ClGemm.h | 89 +- src/gpu/cl/operators/ClGemmConv2d.cpp | 283 +-- src/gpu/cl/operators/ClGemmConv2d.h | 35 +- .../cl/operators/ClGemmLowpMatrixMultiplyCore.cpp | 578 +++--- .../cl/operators/ClGemmLowpMatrixMultiplyCore.h | 33 +- src/gpu/cl/operators/ClGemmLowpOutputStage.cpp | 23 +- src/gpu/cl/operators/ClGemmLowpOutputStage.h | 11 +- src/gpu/cl/operators/ClIndirectConv2d.cpp | 52 +- src/gpu/cl/operators/ClIndirectConv2d.h | 25 +- src/gpu/cl/operators/ClLogicalNot.cpp | 3 +- src/gpu/cl/operators/ClMatMul.cpp | 30 +- src/gpu/cl/operators/ClMatMul.h | 19 +- src/gpu/cl/operators/ClMul.cpp | 36 +- src/gpu/cl/operators/ClMul.h | 30 +- src/gpu/cl/operators/ClPRelu.cpp | 12 +- src/gpu/cl/operators/ClPRelu.h | 3 +- src/gpu/cl/operators/ClPermute.cpp | 10 +- src/gpu/cl/operators/ClPermute.h | 7 +- src/gpu/cl/operators/ClPool2d.cpp | 14 +- src/gpu/cl/operators/ClPool2d.h | 11 +- src/gpu/cl/operators/ClPool3d.cpp | 8 +- 
src/gpu/cl/operators/ClPool3d.h | 5 +- src/gpu/cl/operators/ClQuantize.cpp | 4 +- src/gpu/cl/operators/ClReshape.cpp | 5 +- src/gpu/cl/operators/ClScale.cpp | 11 +- src/gpu/cl/operators/ClScale.h | 4 +- src/gpu/cl/operators/ClSoftmax.cpp | 76 +- src/gpu/cl/operators/ClSoftmax.h | 16 +- src/gpu/cl/operators/ClSub.cpp | 18 +- src/gpu/cl/operators/ClSub.h | 12 +- src/gpu/cl/operators/ClTranspose.cpp | 5 +- src/gpu/cl/operators/ClTransposedConvolution.cpp | 19 +- src/gpu/cl/operators/ClTransposedConvolution.h | 19 +- src/gpu/cl/operators/ClWinogradConv2d.cpp | 175 +- src/gpu/cl/operators/ClWinogradConv2d.h | 26 +- src/gpu/cl/utils/ClAuxTensorHandler.h | 26 +- src/graph/DataLayerVisitor.cpp | 24 +- src/graph/Graph.cpp | 38 +- src/graph/GraphBuilder.cpp | 278 ++- src/graph/GraphContext.cpp | 15 +- src/graph/GraphManager.cpp | 25 +- src/graph/INode.cpp | 6 +- src/graph/INodeVisitor.cpp | 1 + src/graph/PassManager.cpp | 19 +- src/graph/Tensor.cpp | 8 +- src/graph/TypeLoader.cpp | 42 +- src/graph/Utils.cpp | 55 +- src/graph/Workload.cpp | 11 +- src/graph/algorithms/TopologicalSort.cpp | 36 +- src/graph/backends/BackendRegistry.cpp | 3 +- src/graph/backends/CL/CLDeviceBackend.cpp | 37 +- src/graph/backends/CL/CLFunctionsFactory.cpp | 151 +- src/graph/backends/CL/CLNodeValidator.cpp | 60 +- src/graph/backends/CL/CLSubTensorHandle.cpp | 7 +- src/graph/backends/CL/CLTensorHandle.cpp | 9 +- src/graph/backends/NEON/NEDeviceBackend.cpp | 29 +- src/graph/backends/NEON/NEFunctionFactory.cpp | 119 +- src/graph/backends/NEON/NENodeValidator.cpp | 64 +- src/graph/backends/NEON/NESubTensorHandle.cpp | 7 +- src/graph/backends/NEON/NETensorHandle.cpp | 10 +- .../detail/CrossLayerMemoryManagerHelpers.cpp | 59 +- src/graph/detail/ExecutionHelpers.cpp | 87 +- src/graph/frontend/Stream.cpp | 5 +- src/graph/frontend/SubStream.cpp | 5 +- src/graph/mutators/DepthConcatSubTensorMutator.cpp | 39 +- src/graph/mutators/GroupedConvolutionMutator.cpp | 77 +- src/graph/mutators/InPlaceOperationMutator.cpp | 105 +- src/graph/mutators/MutatorUtils.cpp | 8 +- src/graph/mutators/NodeExecutionMethodMutator.cpp | 42 +- src/graph/mutators/NodeFusionMutator.cpp | 169 +- src/graph/mutators/SplitLayerSubTensorMutator.cpp | 33 +- src/graph/mutators/SyntheticDataTypeMutator.cpp | 72 +- src/graph/nodes/ActivationLayerNode.cpp | 4 +- src/graph/nodes/ArgMinMaxLayerNode.cpp | 17 +- src/graph/nodes/BatchNormalizationLayerNode.cpp | 4 +- src/graph/nodes/BoundingBoxTransformLayerNode.cpp | 8 +- src/graph/nodes/ChannelShuffleLayerNode.cpp | 7 +- src/graph/nodes/ConcatenateLayerNode.cpp | 22 +- src/graph/nodes/ConstNode.cpp | 5 +- src/graph/nodes/ConvolutionLayerNode.cpp | 17 +- src/graph/nodes/DeconvolutionLayerNode.cpp | 10 +- src/graph/nodes/DepthToSpaceLayerNode.cpp | 11 +- src/graph/nodes/DepthwiseConvolutionLayerNode.cpp | 24 +- src/graph/nodes/DequantizationLayerNode.cpp | 4 +- src/graph/nodes/DetectionOutputLayerNode.cpp | 9 +- src/graph/nodes/DetectionPostProcessLayerNode.cpp | 11 +- src/graph/nodes/DummyNode.cpp | 7 +- src/graph/nodes/EltwiseLayerNode.cpp | 11 +- src/graph/nodes/FlattenLayerNode.cpp | 4 +- src/graph/nodes/FullyConnectedLayer.cpp | 29 +- .../FusedConvolutionBatchNormalizationNode.cpp | 29 +- ...dDepthwiseConvolutionBatchNormalizationNode.cpp | 40 +- src/graph/nodes/GenerateProposalsLayerNode.cpp | 14 +- src/graph/nodes/InputNode.cpp | 5 +- src/graph/nodes/L2NormalizeLayerNode.cpp | 13 +- src/graph/nodes/NormalizationLayerNode.cpp | 7 +- src/graph/nodes/NormalizePlanarYUVLayerNode.cpp | 2 +- 
src/graph/nodes/PReluLayerNode.cpp | 2 +- src/graph/nodes/PadLayerNode.cpp | 10 +- src/graph/nodes/PermuteLayerNode.cpp | 12 +- src/graph/nodes/PoolingLayerNode.cpp | 10 +- src/graph/nodes/PrintLayerNode.cpp | 8 +- src/graph/nodes/PriorBoxLayerNode.cpp | 7 +- src/graph/nodes/QuantizationLayerNode.cpp | 2 +- src/graph/nodes/ROIAlignLayerNode.cpp | 10 +- src/graph/nodes/ReductionLayerNode.cpp | 9 +- src/graph/nodes/ReorgLayerNode.cpp | 13 +- src/graph/nodes/ReshapeLayer.cpp | 10 +- src/graph/nodes/ResizeLayerNode.cpp | 4 +- src/graph/nodes/SliceLayerNode.cpp | 10 +- src/graph/nodes/SoftmaxLayerNode.cpp | 7 +- src/graph/nodes/SplitLayerNode.cpp | 26 +- src/graph/nodes/StackLayerNode.cpp | 18 +- src/graph/nodes/StridedSliceLayerNode.cpp | 2 +- src/graph/printers/DotGraphPrinter.cpp | 16 +- src/runtime/Allocator.cpp | 2 +- src/runtime/BlobLifetimeManager.cpp | 30 +- src/runtime/BlobMemoryPool.cpp | 6 +- src/runtime/CL/CLBufferAllocator.cpp | 3 +- src/runtime/CL/CLGEMMHeuristicsHandle.cpp | 3 +- src/runtime/CL/CLHelpers.cpp | 41 +- src/runtime/CL/CLMemory.cpp | 12 +- src/runtime/CL/CLMemoryRegion.cpp | 26 +- src/runtime/CL/CLOperator.cpp | 5 +- src/runtime/CL/CLRuntimeContext.cpp | 6 +- src/runtime/CL/CLScheduler.cpp | 47 +- src/runtime/CL/CLSubTensor.cpp | 10 +- src/runtime/CL/CLTensorAllocator.cpp | 40 +- src/runtime/CL/CLTuner.cpp | 90 +- src/runtime/CL/ICLSimpleFunction.cpp | 5 +- src/runtime/CL/Utils.cpp | 16 +- src/runtime/CL/functions/CLActivationLayer.cpp | 22 +- src/runtime/CL/functions/CLArgMinMaxLayer.cpp | 63 +- .../CL/functions/CLBatchNormalizationLayer.cpp | 37 +- src/runtime/CL/functions/CLBatchToSpaceLayer.cpp | 30 +- src/runtime/CL/functions/CLBitwiseAnd.cpp | 10 +- src/runtime/CL/functions/CLBitwiseNot.cpp | 5 +- src/runtime/CL/functions/CLBitwiseOr.cpp | 10 +- src/runtime/CL/functions/CLBitwiseXor.cpp | 8 +- .../CL/functions/CLBoundingBoxTransform.cpp | 19 +- src/runtime/CL/functions/CLCast.cpp | 22 +- src/runtime/CL/functions/CLChannelShuffleLayer.cpp | 7 +- src/runtime/CL/functions/CLComparison.cpp | 37 +- src/runtime/CL/functions/CLConcatenateLayer.cpp | 28 +- src/runtime/CL/functions/CLConv3D.cpp | 39 +- .../functions/CLConvertFullyConnectedWeights.cpp | 32 +- src/runtime/CL/functions/CLConvolutionLayer.cpp | 102 +- src/runtime/CL/functions/CLCopy.cpp | 15 +- src/runtime/CL/functions/CLCrop.cpp | 48 +- src/runtime/CL/functions/CLCropResize.cpp | 194 +- src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 74 +- .../CL/functions/CLDeconvolutionLayerUpsample.cpp | 17 +- src/runtime/CL/functions/CLDepthConvertLayer.cpp | 26 +- src/runtime/CL/functions/CLDepthToSpaceLayer.cpp | 8 +- .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 151 +- src/runtime/CL/functions/CLDequantizationLayer.cpp | 17 +- .../CL/functions/CLDirectConvolutionLayer.cpp | 46 +- .../CL/functions/CLDirectDeconvolutionLayer.cpp | 80 +- .../CL/functions/CLElementwiseOperations.cpp | 206 ++- .../CL/functions/CLElementwiseUnaryLayer.cpp | 78 +- src/runtime/CL/functions/CLFFT1D.cpp | 30 +- src/runtime/CL/functions/CLFFT2D.cpp | 16 +- src/runtime/CL/functions/CLFFTConvolutionLayer.cpp | 123 +- src/runtime/CL/functions/CLFill.cpp | 17 +- src/runtime/CL/functions/CLFlattenLayer.cpp | 24 +- src/runtime/CL/functions/CLFloor.cpp | 12 +- src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 64 +- .../CL/functions/CLFuseBatchNormalization.cpp | 57 +- src/runtime/CL/functions/CLGEMM.cpp | 56 +- .../CL/functions/CLGEMMConvolutionLayer.cpp | 88 +- .../CL/functions/CLGEMMDeconvolutionLayer.cpp | 196 +- 
.../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 38 +- src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 34 +- src/runtime/CL/functions/CLGather.cpp | 8 +- .../CL/functions/CLGenerateProposalsLayer.cpp | 192 +- .../CL/functions/CLIndirectConvolutionLayer.cpp | 44 +- .../CL/functions/CLInstanceNormalizationLayer.cpp | 36 +- src/runtime/CL/functions/CLL2NormalizeLayer.cpp | 10 +- src/runtime/CL/functions/CLLSTMLayer.cpp | 575 ++++-- src/runtime/CL/functions/CLLSTMLayerQuantized.cpp | 410 ++-- src/runtime/CL/functions/CLLogicalAnd.cpp | 26 +- src/runtime/CL/functions/CLLogicalNot.cpp | 14 +- src/runtime/CL/functions/CLLogicalOr.cpp | 26 +- src/runtime/CL/functions/CLMatMul.cpp | 29 +- src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp | 21 +- .../functions/CLMeanStdDevNormalizationLayer.cpp | 7 +- src/runtime/CL/functions/CLNormalizationLayer.cpp | 20 +- .../CL/functions/CLNormalizePlanarYUVLayer.cpp | 20 +- src/runtime/CL/functions/CLPReluLayer.cpp | 23 +- src/runtime/CL/functions/CLPadLayer.cpp | 41 +- src/runtime/CL/functions/CLPermute.cpp | 18 +- .../CL/functions/CLPixelWiseMultiplication.cpp | 82 +- src/runtime/CL/functions/CLPooling3dLayer.cpp | 20 +- src/runtime/CL/functions/CLPoolingLayer.cpp | 31 +- src/runtime/CL/functions/CLPriorBoxLayer.cpp | 34 +- src/runtime/CL/functions/CLQLSTMLayer.cpp | 942 ++++++---- src/runtime/CL/functions/CLQuantizationLayer.cpp | 10 +- src/runtime/CL/functions/CLRNNLayer.cpp | 53 +- src/runtime/CL/functions/CLROIAlignLayer.cpp | 20 +- src/runtime/CL/functions/CLROIPoolingLayer.cpp | 19 +- src/runtime/CL/functions/CLRange.cpp | 5 +- src/runtime/CL/functions/CLReduceMean.cpp | 90 +- src/runtime/CL/functions/CLReductionOperation.cpp | 65 +- src/runtime/CL/functions/CLReorgLayer.cpp | 7 +- src/runtime/CL/functions/CLReshapeLayer.cpp | 14 +- src/runtime/CL/functions/CLReverse.cpp | 7 +- src/runtime/CL/functions/CLScale.cpp | 15 +- src/runtime/CL/functions/CLSelect.cpp | 8 +- src/runtime/CL/functions/CLSlice.cpp | 41 +- src/runtime/CL/functions/CLSoftmaxLayer.cpp | 22 +- src/runtime/CL/functions/CLSpaceToBatchLayer.cpp | 71 +- src/runtime/CL/functions/CLSpaceToDepthLayer.cpp | 10 +- src/runtime/CL/functions/CLSplit.cpp | 3 +- src/runtime/CL/functions/CLStackLayer.cpp | 21 +- src/runtime/CL/functions/CLStridedSlice.cpp | 81 +- src/runtime/CL/functions/CLTile.cpp | 8 +- src/runtime/CL/functions/CLTranspose.cpp | 12 +- src/runtime/CL/functions/CLUnstack.cpp | 38 +- .../CL/functions/CLWinogradConvolutionLayer.cpp | 65 +- src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp | 240 +-- src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp | 34 +- src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp | 169 +- src/runtime/CL/gemm/CLGEMMKernelSelection.h | 3 +- .../gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp | 74 +- .../CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h | 7 +- src/runtime/CL/mlgo/Common.h | 40 +- src/runtime/CL/mlgo/HeuristicTree.cpp | 89 +- src/runtime/CL/mlgo/HeuristicTree.h | 24 +- src/runtime/CL/mlgo/MLGOHeuristics.cpp | 99 +- src/runtime/CL/mlgo/MLGOHeuristics.h | 6 +- src/runtime/CL/mlgo/MLGOParser.cpp | 188 +- src/runtime/CL/mlgo/MLGOParser.h | 9 +- src/runtime/CL/mlgo/Utils.cpp | 48 +- src/runtime/CL/mlgo/Utils.h | 10 +- src/runtime/CL/tuners/CLTuningParametersList.cpp | 50 +- src/runtime/CPP/CPPScheduler.cpp | 94 +- src/runtime/CPP/SingleThreadScheduler.cpp | 11 +- .../CPPBoxWithNonMaximaSuppressionLimit.cpp | 157 +- .../CPP/functions/CPPDetectionOutputLayer.cpp | 312 ++-- .../CPP/functions/CPPDetectionPostProcessLayer.cpp | 414 +++-- 
.../CPP/functions/CPPNonMaximumSuppression.cpp | 21 +- src/runtime/CPP/functions/CPPTopKV.cpp | 5 +- src/runtime/IScheduler.cpp | 77 +- src/runtime/ISimpleLifetimeManager.cpp | 27 +- src/runtime/IWeightsManager.cpp | 45 +- src/runtime/Memory.cpp | 9 +- src/runtime/MemoryManagerOnDemand.cpp | 5 +- src/runtime/NEON/INEOperator.cpp | 7 +- src/runtime/NEON/INESimpleFunction.cpp | 4 +- src/runtime/NEON/INESimpleFunctionNoBorder.cpp | 5 +- src/runtime/NEON/functions/NEActivationLayer.cpp | 17 +- src/runtime/NEON/functions/NEAddMulAdd.cpp | 44 +- src/runtime/NEON/functions/NEArgMinMaxLayer.cpp | 15 +- .../NEON/functions/NEArithmeticAddition.cpp | 26 +- .../NEON/functions/NEArithmeticSubtraction.cpp | 26 +- .../NEON/functions/NEBatchNormalizationLayer.cpp | 25 +- src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp | 13 +- src/runtime/NEON/functions/NEBitwiseAnd.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseNot.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseOr.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseXor.cpp | 3 +- .../NEON/functions/NEBoundingBoxTransform.cpp | 11 +- src/runtime/NEON/functions/NECast.cpp | 14 +- .../NEON/functions/NEChannelShuffleLayer.cpp | 1 + src/runtime/NEON/functions/NEConcatenateLayer.cpp | 30 +- src/runtime/NEON/functions/NEConv3D.cpp | 27 +- .../functions/NEConvertFullyConnectedWeights.cpp | 24 +- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 84 +- src/runtime/NEON/functions/NECopy.cpp | 12 +- src/runtime/NEON/functions/NECropResize.cpp | 54 +- .../NEON/functions/NEDeconvolutionLayer.cpp | 116 +- src/runtime/NEON/functions/NEDepthConvertLayer.cpp | 17 +- src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp | 1 + .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 193 +- .../NEON/functions/NEDequantizationLayer.cpp | 10 +- .../NEON/functions/NEDetectionPostProcessLayer.cpp | 60 +- .../NEON/functions/NEDirectConvolutionLayer.cpp | 27 +- .../NEON/functions/NEElementwiseOperations.cpp | 152 +- .../NEON/functions/NEElementwiseUnaryLayer.cpp | 15 +- src/runtime/NEON/functions/NEFFT1D.cpp | 29 +- src/runtime/NEON/functions/NEFFT2D.cpp | 8 +- .../NEON/functions/NEFFTConvolutionLayer.cpp | 105 +- src/runtime/NEON/functions/NEFill.cpp | 10 +- src/runtime/NEON/functions/NEFillBorder.cpp | 9 +- src/runtime/NEON/functions/NEFlattenLayer.cpp | 22 +- src/runtime/NEON/functions/NEFloor.cpp | 12 +- .../NEON/functions/NEFullyConnectedLayer.cpp | 77 +- .../NEON/functions/NEFuseBatchNormalization.cpp | 42 +- src/runtime/NEON/functions/NEGEMM.cpp | 62 +- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 39 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 76 +- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 61 +- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 33 +- src/runtime/NEON/functions/NEGather.cpp | 3 +- .../NEON/functions/NEGenerateProposalsLayer.cpp | 187 +- .../functions/NEInstanceNormalizationLayer.cpp | 26 +- src/runtime/NEON/functions/NEL2NormalizeLayer.cpp | 4 +- src/runtime/NEON/functions/NELSTMLayer.cpp | 510 +++-- .../NEON/functions/NELSTMLayerQuantized.cpp | 383 ++-- src/runtime/NEON/functions/NELogical.cpp | 12 +- src/runtime/NEON/functions/NEMatMul.cpp | 28 +- src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp | 24 +- .../functions/NEMeanStdDevNormalizationLayer.cpp | 3 +- .../NEON/functions/NENormalizationLayer.cpp | 10 +- src/runtime/NEON/functions/NEPReluLayer.cpp | 14 +- src/runtime/NEON/functions/NEPadLayer.cpp | 90 +- src/runtime/NEON/functions/NEPermute.cpp | 10 +- .../NEON/functions/NEPixelWiseMultiplication.cpp | 50 +- 
src/runtime/NEON/functions/NEPooling3dLayer.cpp | 17 +- src/runtime/NEON/functions/NEPoolingLayer.cpp | 21 +- src/runtime/NEON/functions/NEPriorBoxLayer.cpp | 13 +- src/runtime/NEON/functions/NEQLSTMLayer.cpp | 1082 ++++++----- src/runtime/NEON/functions/NEQuantizationLayer.cpp | 10 +- src/runtime/NEON/functions/NERNNLayer.cpp | 44 +- src/runtime/NEON/functions/NEROIAlignLayer.cpp | 10 +- src/runtime/NEON/functions/NEROIPoolingLayer.cpp | 17 +- src/runtime/NEON/functions/NERange.cpp | 6 +- src/runtime/NEON/functions/NEReduceMean.cpp | 55 +- .../NEON/functions/NEReductionOperation.cpp | 75 +- src/runtime/NEON/functions/NEReorderLayer.cpp | 19 +- src/runtime/NEON/functions/NEReorgLayer.cpp | 3 +- src/runtime/NEON/functions/NEReshapeLayer.cpp | 12 +- src/runtime/NEON/functions/NEReverse.cpp | 8 +- src/runtime/NEON/functions/NEScale.cpp | 48 +- src/runtime/NEON/functions/NESelect.cpp | 1 + src/runtime/NEON/functions/NESlice.cpp | 35 +- src/runtime/NEON/functions/NESoftmaxLayer.cpp | 21 +- src/runtime/NEON/functions/NESpaceToBatchLayer.cpp | 38 +- src/runtime/NEON/functions/NESpaceToDepthLayer.cpp | 4 +- src/runtime/NEON/functions/NESplit.cpp | 2 +- src/runtime/NEON/functions/NEStackLayer.cpp | 11 +- src/runtime/NEON/functions/NEStridedSlice.cpp | 59 +- src/runtime/NEON/functions/NETile.cpp | 3 +- src/runtime/NEON/functions/NETranspose.cpp | 10 +- src/runtime/NEON/functions/NEUnstack.cpp | 34 +- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 44 +- src/runtime/OMP/OMPScheduler.cpp | 14 +- src/runtime/OffsetLifetimeManager.cpp | 20 +- src/runtime/OffsetMemoryPool.cpp | 8 +- src/runtime/OperatorTensor.cpp | 3 +- src/runtime/PoolManager.cpp | 11 +- src/runtime/RuntimeContext.cpp | 3 +- src/runtime/Scheduler.cpp | 13 +- src/runtime/SchedulerFactory.cpp | 2 +- src/runtime/SchedulerUtils.cpp | 19 +- src/runtime/SubTensor.cpp | 3 +- src/runtime/Tensor.cpp | 3 +- src/runtime/TensorAllocator.cpp | 19 +- src/runtime/Utils.cpp | 15 +- .../ClDirectConvDefaultConfigBifrost.cpp | 67 +- .../direct_conv/ClDirectConvDefaultConfigBifrost.h | 20 +- .../ClDirectConvDefaultConfigValhall.cpp | 125 +- .../direct_conv/ClDirectConvDefaultConfigValhall.h | 20 +- .../direct_conv/ClDirectConvKernelConfig.h | 4 +- .../direct_conv/IClDirectConvKernelConfig.h | 14 +- .../dwc_native/ClDWCNativeDefaultConfigBifrost.cpp | 133 +- .../dwc_native/ClDWCNativeDefaultConfigBifrost.h | 42 +- .../dwc_native/ClDWCNativeDefaultConfigValhall.cpp | 127 +- .../dwc_native/ClDWCNativeDefaultConfigValhall.h | 35 +- .../dwc_native/ClDWCNativeHeuristicsHelpers.cpp | 8 +- .../dwc_native/ClDWCNativeKernelConfig.h | 2 +- .../dwc_native/IClDWCNativeKernelConfig.h | 16 +- .../ClIndirectConvDefaultConfigValhall.cpp | 60 +- .../ClIndirectConvDefaultConfigValhall.h | 9 +- .../indirect_conv/ClIndirectConvKernelConfig.h | 2 +- .../indirect_conv/IClIndirectConvKernelConfig.h | 12 +- .../ClMatMulNativeDefaultConfigValhall.cpp | 396 ++-- .../ClMatMulNativeDefaultConfigValhall.h | 11 +- .../matmul_native/ClMatMulNativeHelpers.cpp | 42 +- .../matmul_native/ClMatMulNativeHelpers.h | 15 +- .../matmul_native/ClMatMulNativeKernelConfig.h | 4 +- .../matmul_native/IClMatMulNativeKernelConfig.h | 9 +- 1334 files changed, 49885 insertions(+), 37971 deletions(-) (limited to 'src') diff --git a/src/c/AclContext.cpp b/src/c/AclContext.cpp index 9b8ffea619..c6c0820c92 100644 --- a/src/c/AclContext.cpp +++ b/src/c/AclContext.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "arm_compute/AclEntrypoints.h" - #include "arm_compute/core/Error.h" #include "src/common/IContext.h" @@ -42,25 +41,25 @@ namespace template arm_compute::IContext *create_backend_ctx(const AclContextOptions *options) { - return new(std::nothrow) ContextType(options); + return new (std::nothrow) ContextType(options); } bool is_target_valid(AclTarget target) { - return arm_compute::utils::is_in(target, { AclCpu, AclGpuOcl }); + return arm_compute::utils::is_in(target, {AclCpu, AclGpuOcl}); } bool are_context_options_valid(const AclContextOptions *options) { ARM_COMPUTE_ASSERT_NOT_NULLPTR(options); - return arm_compute::utils::is_in(options->mode, { AclPreferFastRerun, AclPreferFastStart }); + return arm_compute::utils::is_in(options->mode, {AclPreferFastRerun, AclPreferFastStart}); } arm_compute::IContext *create_context(AclTarget target, const AclContextOptions *options) { ARM_COMPUTE_UNUSED(options); - switch(target) + switch (target) { #ifdef ARM_COMPUTE_CPU_ENABLED case AclCpu: @@ -77,24 +76,22 @@ arm_compute::IContext *create_context(AclTarget target, const AclContextOptions } } // namespace -extern "C" AclStatus AclCreateContext(AclContext *external_ctx, - AclTarget target, - const AclContextOptions *options) +extern "C" AclStatus AclCreateContext(AclContext *external_ctx, AclTarget target, const AclContextOptions *options) { - if(!is_target_valid(target)) + if (!is_target_valid(target)) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Target is invalid!"); return AclUnsupportedTarget; } - if(options != nullptr && !are_context_options_valid(options)) + if (options != nullptr && !are_context_options_valid(options)) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context options are invalid!"); return AclInvalidArgument; } auto ctx = create_context(target, options); - if(ctx == nullptr) + if (ctx == nullptr) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources for context creation!"); return AclOutOfMemory; @@ -113,7 +110,7 @@ extern "C" AclStatus AclDestroyContext(AclContext external_ctx) StatusCode status = detail::validate_internal_context(ctx); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(ctx->refcount() != 0) + if (ctx->refcount() != 0) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context has references on it that haven't been released!"); // TODO: Fix the refcount with callback when reaches 0 diff --git a/src/c/AclQueue.cpp b/src/c/AclQueue.cpp index 020c6ed531..c3e867bffc 100644 --- a/src/c/AclQueue.cpp +++ b/src/c/AclQueue.cpp @@ -38,7 +38,7 @@ namespace bool is_mode_valid(const AclQueueOptions *options) { ARM_COMPUTE_ASSERT_NOT_NULLPTR(options); - return arm_compute::utils::is_in(options->mode, { AclTuningModeNone, AclRapid, AclNormal, AclExhaustive }); + return arm_compute::utils::is_in(options->mode, {AclTuningModeNone, AclRapid, AclNormal, AclExhaustive}); } } // namespace @@ -51,14 +51,14 @@ extern "C" AclStatus AclCreateQueue(AclQueue *external_queue, AclContext externa StatusCode status = detail::validate_internal_context(ctx); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(options != nullptr && !is_mode_valid(options)) + if (options != nullptr && !is_mode_valid(options)) { ARM_COMPUTE_LOG_ERROR_ACL("Queue options are invalid"); return AclInvalidArgument; } auto queue = ctx->create_queue(options); - if(queue == nullptr) + if (queue == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return AclOutOfMemory; diff --git a/src/c/AclTensor.cpp b/src/c/AclTensor.cpp index 5b184697aa..c4cd08ac70 100644 --- 
a/src/c/AclTensor.cpp +++ b/src/c/AclTensor.cpp @@ -24,6 +24,7 @@ #include "arm_compute/AclEntrypoints.h" #include "arm_compute/AclUtils.h" #include "arm_compute/core/Error.h" + #include "src/common/ITensorV2.h" #include "src/common/utils/Macros.h" @@ -41,17 +42,17 @@ constexpr int32_t max_allowed_dims = 6; */ bool is_desc_valid(const AclTensorDescriptor &desc) { - if(desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown) + if (desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Unknown data type!"); return false; } - if(desc.ndims > max_allowed_dims) + if (desc.ndims > max_allowed_dims) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions surpass the maximum allowed value!"); return false; } - if(desc.ndims > 0 && desc.shape == nullptr) + if (desc.ndims > 0 && desc.shape == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions values are empty while dimensionality is > 0!"); return false; @@ -66,10 +67,8 @@ StatusCode convert_and_validate_tensor(AclTensor tensor, ITensorV2 **internal_te } } // namespace -extern "C" AclStatus AclCreateTensor(AclTensor *external_tensor, - AclContext external_ctx, - const AclTensorDescriptor *desc, - bool allocate) +extern "C" AclStatus +AclCreateTensor(AclTensor *external_tensor, AclContext external_ctx, const AclTensorDescriptor *desc, bool allocate) { using namespace arm_compute; @@ -78,14 +77,14 @@ extern "C" AclStatus AclCreateTensor(AclTensor *external_tensor, StatusCode status = detail::validate_internal_context(ctx); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(desc == nullptr || !is_desc_valid(*desc)) + if (desc == nullptr || !is_desc_valid(*desc)) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Descriptor is invalid!"); return AclInvalidArgument; } auto tensor = ctx->create_tensor(*desc, allocate); - if(tensor == nullptr) + if (tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Couldn't allocate internal resources for tensor creation!"); return AclOutOfMemory; @@ -103,7 +102,7 @@ extern "C" AclStatus AclMapTensor(AclTensor external_tensor, void **handle) StatusCode status = detail::validate_internal_tensor(tensor); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); - if(handle == nullptr) + if (handle == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[AclMapTensor]: Handle object is nullptr!"); return AclInvalidArgument; @@ -160,12 +159,12 @@ extern "C" AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size) { using namespace arm_compute; - if(size == nullptr) + if (size == nullptr) { return AclStatus::AclInvalidArgument; } - ITensorV2 *internal_tensor{ nullptr }; + ITensorV2 *internal_tensor{nullptr}; auto status = convert_and_validate_tensor(tensor, &internal_tensor); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); @@ -177,15 +176,15 @@ extern "C" AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescripto { using namespace arm_compute; - if(desc == nullptr) + if (desc == nullptr) { return AclStatus::AclInvalidArgument; } - ITensorV2 *internal_tensor{ nullptr }; + ITensorV2 *internal_tensor{nullptr}; const auto status = convert_and_validate_tensor(tensor, &internal_tensor); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); *desc = internal_tensor->get_descriptor(); return utils::as_cenum(status); -} \ No newline at end of file +} diff --git a/src/c/AclTensorPack.cpp b/src/c/AclTensorPack.cpp index 6202524ca7..daf1be4f44 100644 --- a/src/c/AclTensorPack.cpp +++ b/src/c/AclTensorPack.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/AclEntrypoints.h" + #include "src/common/ITensorV2.h" #include "src/common/TensorPack.h" #include "src/common/utils/Macros.h" @@ -36,7 +37,7 @@ StatusCode PackTensorInternal(TensorPack &pack, AclTensor external_tensor, int32 status = detail::validate_internal_tensor(tensor); - if(status != StatusCode::Success) + if (status != StatusCode::Success) { return status; } @@ -57,7 +58,7 @@ extern "C" AclStatus AclCreateTensorPack(AclTensorPack *external_pack, AclContex ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); auto pack = new TensorPack(ctx); - if(pack == nullptr) + if (pack == nullptr) { ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources!"); return AclOutOfMemory; @@ -77,14 +78,15 @@ extern "C" AclStatus AclPackTensor(AclTensorPack external_pack, AclTensor extern return AclStatus::AclSuccess; } -extern "C" AclStatus AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors) +extern "C" AclStatus +AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors) { using namespace arm_compute; auto pack = get_internal(external_pack); ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(detail::validate_internal_pack(pack)); - for(unsigned i = 0; i < num_tensors; ++i) + for (unsigned i = 0; i < num_tensors; ++i) { ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(PackTensorInternal(*pack, external_tensors[i], slot_ids[i])); } diff --git a/src/c/AclVersion.cpp b/src/c/AclVersion.cpp index 971189a6d4..a659e90837 100644 --- a/src/c/AclVersion.cpp +++ b/src/c/AclVersion.cpp @@ -25,8 +25,7 @@ namespace { -constexpr AclVersion version_info -{ +constexpr AclVersion version_info{ ARM_COMPUTE_LIBRARY_VERSION_MAJOR, ARM_COMPUTE_LIBRARY_VERSION_MINOR, ARM_COMPUTE_LIBRARY_VERSION_PATCH, diff --git a/src/c/cl/AclOpenClExt.cpp b/src/c/cl/AclOpenClExt.cpp index e72babcae8..8e42cf5510 100644 --- a/src/c/cl/AclOpenClExt.cpp +++ b/src/c/cl/AclOpenClExt.cpp @@ -23,13 +23,12 @@ */ #include "arm_compute/AclOpenClExt.h" +#include "arm_compute/core/CL/ICLTensor.h" + #include "src/common/ITensorV2.h" #include "src/common/Types.h" #include "src/gpu/cl/ClContext.h" #include "src/gpu/cl/ClQueue.h" - -#include "arm_compute/core/CL/ICLTensor.h" - #include "support/Cast.h" extern "C" AclStatus AclGetClContext(AclContext external_ctx, cl_context *opencl_context) @@ -37,17 +36,17 @@ extern "C" AclStatus AclGetClContext(AclContext external_ctx, cl_context *opencl using namespace arm_compute; IContext *ctx = get_internal(external_ctx); - if(detail::validate_internal_context(ctx) != StatusCode::Success) + if (detail::validate_internal_context(ctx) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(ctx->type() != Target::GpuOcl) + if (ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_context == nullptr) + if (opencl_context == nullptr) { return AclStatus::AclInvalidArgument; } @@ -62,23 +61,23 @@ extern "C" AclStatus AclSetClContext(AclContext external_ctx, cl_context opencl_ using namespace arm_compute; IContext *ctx = get_internal(external_ctx); - if(detail::validate_internal_context(ctx) != StatusCode::Success) + if (detail::validate_internal_context(ctx) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(ctx->type() != Target::GpuOcl) + if (ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(ctx->refcount() != 0) + if (ctx->refcount() != 0) { return AclStatus::AclUnsupportedConfig; } auto cl_ctx = 
utils::cast::polymorphic_downcast(ctx); - if(!cl_ctx->set_cl_ctx(::cl::Context(opencl_context))) + if (!cl_ctx->set_cl_ctx(::cl::Context(opencl_context))) { return AclStatus::AclRuntimeError; } @@ -91,17 +90,17 @@ extern "C" AclStatus AclGetClDevice(AclContext external_ctx, cl_device_id *openc using namespace arm_compute; IContext *ctx = get_internal(external_ctx); - if(detail::validate_internal_context(ctx) != StatusCode::Success) + if (detail::validate_internal_context(ctx) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(ctx->type() != Target::GpuOcl) + if (ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_device == nullptr) + if (opencl_device == nullptr) { return AclStatus::AclInvalidArgument; } @@ -116,17 +115,17 @@ extern "C" AclStatus AclGetClQueue(AclQueue external_queue, cl_command_queue *op using namespace arm_compute; IQueue *queue = get_internal(external_queue); - if(detail::validate_internal_queue(queue) != StatusCode::Success) + if (detail::validate_internal_queue(queue) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(queue->header.ctx->type() != Target::GpuOcl) + if (queue->header.ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_queue == nullptr) + if (opencl_queue == nullptr) { return AclStatus::AclInvalidArgument; } @@ -141,18 +140,18 @@ extern "C" AclStatus AclSetClQueue(AclQueue external_queue, cl_command_queue ope using namespace arm_compute; IQueue *queue = get_internal(external_queue); - if(detail::validate_internal_queue(queue) != StatusCode::Success) + if (detail::validate_internal_queue(queue) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(queue->header.ctx->type() != Target::GpuOcl) + if (queue->header.ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } auto cl_queue = utils::cast::polymorphic_downcast(queue); - if(!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue))) + if (!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue))) { return AclStatus::AclRuntimeError; } @@ -165,17 +164,17 @@ extern "C" AclStatus AclGetClMem(AclTensor external_tensor, cl_mem *opencl_mem) using namespace arm_compute; ITensorV2 *tensor = get_internal(external_tensor); - if(detail::validate_internal_tensor(tensor) != StatusCode::Success) + if (detail::validate_internal_tensor(tensor) != StatusCode::Success) { return AclStatus::AclInvalidArgument; } - if(tensor->header.ctx->type() != Target::GpuOcl) + if (tensor->header.ctx->type() != Target::GpuOcl) { return AclStatus::AclInvalidTarget; } - if(opencl_mem == nullptr) + if (opencl_mem == nullptr) { return AclStatus::AclInvalidArgument; } @@ -184,4 +183,4 @@ extern "C" AclStatus AclGetClMem(AclTensor external_tensor, cl_mem *opencl_mem) *opencl_mem = cl_tensor->cl_buffer().get(); return AclStatus::AclSuccess; -} \ No newline at end of file +} diff --git a/src/common/AllocatorWrapper.cpp b/src/common/AllocatorWrapper.cpp index 7b5bb34433..28d81a9fa4 100644 --- a/src/common/AllocatorWrapper.cpp +++ b/src/common/AllocatorWrapper.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/common/AllocatorWrapper.h" + #include "arm_compute/core/Error.h" namespace arm_compute @@ -57,7 +58,7 @@ void AllocatorWrapper::aligned_free(void *ptr) void AllocatorWrapper::set_user_data(void *user_data) { - if(user_data != nullptr) + if (user_data != nullptr) { _backing_allocator.user_data = user_data; } diff --git a/src/common/AllocatorWrapper.h b/src/common/AllocatorWrapper.h index 5e1f138f16..bbf70a2cb1 100644 --- a/src/common/AllocatorWrapper.h +++ b/src/common/AllocatorWrapper.h @@ -37,8 +37,8 @@ public: * @param[in] backing_allocator Backing memory allocator to be used */ AllocatorWrapper(const AclAllocator &backing_allocator) noexcept; - AllocatorWrapper(const AllocatorWrapper &) noexcept = default; - AllocatorWrapper(AllocatorWrapper &&) noexcept = default; + AllocatorWrapper(const AllocatorWrapper &) noexcept = default; + AllocatorWrapper(AllocatorWrapper &&) noexcept = default; AllocatorWrapper &operator=(const AllocatorWrapper &) noexcept = delete; AllocatorWrapper &operator=(AllocatorWrapper &&other) noexcept = default; /** Allocate a chunk of memory of a given size in bytes @@ -78,4 +78,4 @@ private: }; } // namespace arm_compute -#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */ \ No newline at end of file +#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */ diff --git a/src/common/IContext.h b/src/common/IContext.h index 65bb76744d..a221e5db61 100644 --- a/src/common/IContext.h +++ b/src/common/IContext.h @@ -33,7 +33,7 @@ struct AclContext_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Context, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Context, nullptr}; protected: AclContext_() = default; @@ -51,8 +51,7 @@ class IOperator; class IContext : public AclContext_ { public: - IContext(Target target) - : AclContext_(), _target(target), _refcount(0) + IContext(Target target) : AclContext_(), _target(target), _refcount(0) { } /** Virtual Destructor */ @@ -108,11 +107,11 @@ public: * * @return A pointer to the created queue object */ - virtual IQueue *create_queue(const AclQueueOptions *options) = 0; - virtual std::tuple create_activation(const AclTensorDescriptor &src, + virtual IQueue *create_queue(const AclQueueOptions *options) = 0; + virtual std::tuple create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, - bool is_validate) = 0; + bool is_validate) = 0; private: Target _target; /**< Target type of context */ @@ -140,7 +139,7 @@ namespace detail */ inline StatusCode validate_internal_context(const IContext *ctx) { - if(ctx == nullptr || !ctx->is_valid()) + if (ctx == nullptr || !ctx->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("Invalid context object"); return StatusCode::InvalidArgument; diff --git a/src/common/IOperator.cpp b/src/common/IOperator.cpp index b56f0e97fb..90e3473814 100644 --- a/src/common/IOperator.cpp +++ b/src/common/IOperator.cpp @@ -22,13 +22,13 @@ * SOFTWARE. 
*/ #include "src/common/IOperator.h" + #include "src/common/utils/Validate.h" namespace arm_compute { #ifndef DOXYGEN_SKIP_THIS -IOperator::IOperator(IContext *ctx) - : AclOperator_() +IOperator::IOperator(IContext *ctx) : AclOperator_() { ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx); this->header.ctx = ctx; diff --git a/src/common/IOperator.h b/src/common/IOperator.h index 1b65a09e0d..e86e11fe25 100644 --- a/src/common/IOperator.h +++ b/src/common/IOperator.h @@ -30,13 +30,14 @@ // TODO: Remove when all functions have been ported #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IOperator.h" + #include "src/common/utils/Validate.h" #include struct AclOperator_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Operator, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Operator, nullptr}; protected: AclOperator_() = default; @@ -100,7 +101,7 @@ public: } private: - std::unique_ptr _op{ nullptr }; + std::unique_ptr _op{nullptr}; }; /** Extract internal representation of an Operator @@ -124,7 +125,7 @@ namespace detail */ inline StatusCode validate_internal_operator(const IOperator *op) { - if(op == nullptr || !op->is_valid()) + if (op == nullptr || !op->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[IOperator]: Invalid operator object"); return StatusCode::InvalidArgument; diff --git a/src/common/IQueue.h b/src/common/IQueue.h index 6a0cbc75da..60745d206e 100644 --- a/src/common/IQueue.h +++ b/src/common/IQueue.h @@ -28,7 +28,7 @@ struct AclQueue_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Queue, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Queue, nullptr}; protected: AclQueue_() = default; @@ -88,7 +88,7 @@ namespace detail */ inline StatusCode validate_internal_queue(const IQueue *queue) { - if(queue == nullptr || !queue->is_valid()) + if (queue == nullptr || !queue->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[IQueue]: Invalid queue object"); return StatusCode::InvalidArgument; diff --git a/src/common/ITensorV2.cpp b/src/common/ITensorV2.cpp index 39bf1c6fb3..bf3d963926 100644 --- a/src/common/ITensorV2.cpp +++ b/src/common/ITensorV2.cpp @@ -22,7 +22,9 @@ * SOFTWARE. 
*/ #include "src/common/ITensorV2.h" + #include "arm_compute/core/TensorInfo.h" + #include "src/common/utils/LegacySupport.h" namespace arm_compute @@ -36,4 +38,4 @@ AclTensorDescriptor ITensorV2::get_descriptor() const { return detail::convert_to_descriptor(*tensor()->info()); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/common/ITensorV2.h b/src/common/ITensorV2.h index 965aacea23..903bfad66a 100644 --- a/src/common/ITensorV2.h +++ b/src/common/ITensorV2.h @@ -29,7 +29,7 @@ struct AclTensor_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Tensor, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::Tensor, nullptr}; protected: AclTensor_() = default; @@ -49,8 +49,7 @@ public: * * @param[in] ctx Context to be used by the operator */ - explicit ITensorV2(IContext *ctx) - : AclTensor_() + explicit ITensorV2(IContext *ctx) : AclTensor_() { ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx); this->header.ctx = ctx; @@ -128,7 +127,7 @@ namespace detail */ inline StatusCode validate_internal_tensor(const ITensorV2 *tensor) { - if(tensor == nullptr || !tensor->is_valid()) + if (tensor == nullptr || !tensor->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[ITensorV2]: Invalid tensor object"); return StatusCode::InvalidArgument; diff --git a/src/common/TensorPack.cpp b/src/common/TensorPack.cpp index 6c2c7f9622..b51fc0bdd8 100644 --- a/src/common/TensorPack.cpp +++ b/src/common/TensorPack.cpp @@ -22,13 +22,13 @@ * SOFTWARE. */ #include "src/common/TensorPack.h" + #include "src/common/ITensorV2.h" #include "src/common/utils/Validate.h" namespace arm_compute { -TensorPack::TensorPack(IContext *ctx) - : AclTensorPack_(), _pack() +TensorPack::TensorPack(IContext *ctx) : AclTensorPack_(), _pack() { ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx); this->header.ctx = ctx; diff --git a/src/common/TensorPack.h b/src/common/TensorPack.h index f330eee740..b3d1624dae 100644 --- a/src/common/TensorPack.h +++ b/src/common/TensorPack.h @@ -25,11 +25,12 @@ #define SRC_COMMON_ITENSORPACK_H_ #include "arm_compute/core/ITensorPack.h" + #include "src/common/IContext.h" struct AclTensorPack_ { - arm_compute::detail::Header header{ arm_compute::detail::ObjectType::TensorPack, nullptr }; + arm_compute::detail::Header header{arm_compute::detail::ObjectType::TensorPack, nullptr}; protected: AclTensorPack_() = default; @@ -118,7 +119,7 @@ namespace detail */ inline StatusCode validate_internal_pack(const TensorPack *pack) { - if(pack == nullptr || !pack->is_valid()) + if (pack == nullptr || !pack->is_valid()) { ARM_COMPUTE_LOG_ERROR_ACL("[TensorPack]: Invalid tensor pack object"); return StatusCode::InvalidArgument; diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp index cdcdea916c..23a477332a 100644 --- a/src/common/cpuinfo/CpuInfo.cpp +++ b/src/common/cpuinfo/CpuInfo.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" + #include "support/StringSupport.h" #include "support/ToolchainSupport.h" @@ -53,16 +54,16 @@ #endif /* defined(__APPLE__) && defined(__aarch64__)) */ #endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11) -#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg \ - : "=r"(var)) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11) +#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg : 
"=r"(var)) namespace arm_compute { namespace cpuinfo { namespace { -#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) /** Extract MIDR using CPUID information that are exposed to user-space * * @param[in] max_num_cpus Maximum number of possible CPUs @@ -72,15 +73,15 @@ namespace std::vector midr_from_cpuid(uint32_t max_num_cpus) { std::vector cpus; - for(unsigned int i = 0; i < max_num_cpus; ++i) + for (unsigned int i = 0; i < max_num_cpus; ++i) { std::stringstream str; str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1"; std::ifstream file(str.str(), std::ios::in); - if(file.is_open()) + if (file.is_open()) { std::string line; - if(bool(getline(file, line))) + if (bool(getline(file, line))) { cpus.emplace_back(support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16)); } @@ -122,34 +123,35 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed."); std::ifstream file("/proc/cpuinfo", std::ios::in); - if(file.is_open()) + if (file.is_open()) { std::string line; int midr = 0; int curcpu = -1; - while(bool(getline(file, line))) + while (bool(getline(file, line))) { std::array match; ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int newcpu = support::cpp11::stoi(id, nullptr); - if(curcpu >= 0 && midr == 0) + if (curcpu >= 0 && midr == 0) { // Matched a new CPU ID without any description of the previous one - looks like old format. 
return {}; } - if(curcpu >= 0 && curcpu < max_num_cpus) + if (curcpu >= 0 && curcpu < max_num_cpus) { cpus.emplace_back(midr); } else { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!"); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "Trying to populate a core id with id greater than the expected number of cores!"); } midr = 0; @@ -159,7 +161,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int impv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); @@ -169,7 +171,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int varv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); @@ -179,7 +181,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int partv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); @@ -189,7 +191,7 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) + if (ret_status == 0) { std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); int regv = support::cpp11::stoi(subexp, nullptr); @@ -200,13 +202,14 @@ std::vector midr_from_proc_cpuinfo(int max_num_cpus) } } - if(curcpu >= 0 && curcpu < max_num_cpus) + if (curcpu >= 0 && curcpu < max_num_cpus) { cpus.emplace_back(midr); } else { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!"); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "Trying to populate a core id with id greater than the expected number of cores!"); } } @@ -231,11 +234,11 @@ int get_max_cpus() CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in); bool success = false; - if(CPUspresent.is_open()) + if (CPUspresent.is_open()) { std::string line; - if(bool(getline(CPUspresent, line))) + if (bool(getline(CPUspresent, line))) { /* The content of this file is a list of ranges or single values, e.g. * 0-5, or 1-3,5,7 or similar. As we are interested in the @@ -244,9 +247,9 @@ int get_max_cpus() */ auto startfrom = line.begin(); - for(auto i = line.begin(); i < line.end(); ++i) + for (auto i = line.begin(); i < line.end(); ++i) { - if(*i == '-' || *i == ',') + if (*i == '-' || *i == ',') { startfrom = i + 1; } @@ -260,13 +263,14 @@ int get_max_cpus() } // Return std::thread::hardware_concurrency() as a fallback. 
- if(!success) + if (!success) { max_cpus = std::thread::hardware_concurrency(); } return max_cpus; } -#elif defined(__aarch64__) && defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ +#elif defined(__aarch64__) && \ + defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ /** Query features through sysctlbyname * * @return int value queried @@ -278,46 +282,45 @@ int get_hw_capability(const std::string &cap) sysctlbyname(cap.c_str(), &result, &size, NULL, 0); return result; } -#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ +#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ #if defined(BARE_METAL) && defined(__aarch64__) uint64_t get_sve_feature_reg() { uint64_t svefr0 = 0; - __asm __volatile( - ".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n" - "MOV %0, X3" - : "=r"(svefr0) - : - : "x3"); + __asm __volatile(".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n" + "MOV %0, X3" + : "=r"(svefr0) + : + : "x3"); return svefr0; } #endif /* defined(BARE_METAL) && defined(__aarch64__) */ } // namespace -CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector cpus) - : _isa(std::move(isa)), _cpus(std::move(cpus)) +CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector cpus) : _isa(std::move(isa)), _cpus(std::move(cpus)) { } CpuInfo CpuInfo::build() { -#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) const uint32_t hwcaps = getauxval(AT_HWCAP); const uint32_t hwcaps2 = getauxval(AT_HWCAP2); const uint32_t max_cpus = get_max_cpus(); // Populate midr values std::vector cpus_midr; - if(hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID) + if (hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID) { cpus_midr = midr_from_cpuid(max_cpus); } - if(cpus_midr.empty()) + if (cpus_midr.empty()) { cpus_midr = midr_from_proc_cpuinfo(max_cpus); } - if(cpus_midr.empty()) + if (cpus_midr.empty()) { cpus_midr.resize(max_cpus, 0); } @@ -333,7 +336,9 @@ CpuInfo CpuInfo::build() CpuInfo info(isa, cpus_model); return info; -#elif(BARE_METAL) && defined(__aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ +#elif (BARE_METAL) && \ + defined( \ + __aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ // Assume single CPU in bare metal mode. Just read the ID register and feature bits directly. 
uint64_t isar0 = 0, isar1 = 0, pfr0 = 0, pfr1 = 0, svefr0 = 0, midr = 0; @@ -342,7 +347,7 @@ CpuInfo CpuInfo::build() ARM_COMPUTE_GET_FEATURE_REG(pfr0, ID_AA64PFR0_EL1); ARM_COMPUTE_GET_FEATURE_REG(pfr1, ID_AA64PFR1_EL1); ARM_COMPUTE_GET_FEATURE_REG(midr, MIDR_EL1); - if((pfr0 >> 32) & 0xf) + if ((pfr0 >> 32) & 0xf) { svefr0 = get_sve_feature_reg(); } @@ -361,14 +366,14 @@ CpuInfo CpuInfo::build() CpuInfo info(isainfo, cpus_model); return info; #else /* #elif defined(__aarch64__) && defined(__APPLE__) */ - CpuInfo info(CpuIsaInfo(), { CpuModel::GENERIC }); + CpuInfo info(CpuIsaInfo(), {CpuModel::GENERIC}); return info; -#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ +#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */ } CpuModel CpuInfo::cpu_model(uint32_t cpuid) const { - if(cpuid < _cpus.size()) + if (cpuid < _cpus.size()) { return _cpus[cpuid]; } @@ -377,9 +382,10 @@ CpuModel CpuInfo::cpu_model(uint32_t cpuid) const CpuModel CpuInfo::cpu_model() const { -#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) +#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || \ + (!defined(__arm__) && !defined(__aarch64__)) return cpu_model(0); -#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */ +#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */ return cpu_model(sched_getcpu()); #endif /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */ } @@ -406,13 +412,13 @@ uint32_t num_threads_hint() // Read cpuinfo and get occurrence of each core std::ifstream cpuinfo_file("/proc/cpuinfo", std::ios::in); - if(cpuinfo_file.is_open()) + if (cpuinfo_file.is_open()) { std::string line; - while(bool(getline(cpuinfo_file, line))) + while (bool(getline(cpuinfo_file, line))) { std::array match; - if(regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0) + if (regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0) { cpus.emplace_back(line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so))); } @@ -425,13 +431,13 @@ uint32_t num_threads_hint() auto least_frequent_cpu_occurences = [](const std::vector &cpus) -> uint32_t { std::unordered_map cpus_freq; - for(const auto &cpu : cpus) + for (const auto &cpu : cpus) { cpus_freq[cpu]++; } uint32_t vmin = cpus.size() + 1; - for(const auto &cpu_freq : cpus_freq) + for (const auto &cpu_freq : cpus_freq) { vmin = std::min(vmin, cpu_freq.second); } diff --git a/src/common/cpuinfo/CpuIsaInfo.cpp b/src/common/cpuinfo/CpuIsaInfo.cpp index 23da54a35d..597768530b 100644 --- a/src/common/cpuinfo/CpuIsaInfo.cpp +++ b/src/common/cpuinfo/CpuIsaInfo.cpp @@ -24,6 +24,7 @@ #include "src/common/cpuinfo/CpuIsaInfo.h" #include "arm_compute/core/Error.h" + #include "src/common/cpuinfo/CpuModel.h" /* Arm Feature flags */ @@ -31,18 +32,18 @@ #define ARM_COMPUTE_CPU_FEATURE_HWCAP_NEON (1 << 12) /* Arm64 Feature flags */ -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD (1 << 1) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP (1 << 20) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 22) 
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2 (1 << 1) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM (1 << 9) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD (1 << 1) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP (1 << 20) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 22) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2 (1 << 1) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM (1 << 9) #define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEF32MM (1 << 10) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16 (1 << 12) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM (1 << 13) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16 (1 << 14) -#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME (1 << 23) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16 (1 << 12) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM (1 << 13) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16 (1 << 14) +#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME (1 << 23) namespace arm_compute { @@ -71,12 +72,12 @@ void decode_hwcaps(CpuIsaInfo &isa, const uint32_t hwcaps, const uint32_t hwcaps isa.sve2 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2); // Detection of SME from type HWCAP2 in the auxillary vector - isa.sme = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME); - isa.sme2 = isa.sme; // Needs to be set properly + isa.sme = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME); + isa.sme2 = isa.sme; // Needs to be set properly // Data-type support - isa.fp16 = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP); - isa.bf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16); + isa.fp16 = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP); + isa.bf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16); isa.svebf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16); // Instruction extensions @@ -92,12 +93,15 @@ void decode_hwcaps(CpuIsaInfo &isa, const uint32_t hwcaps, const uint32_t hwcaps } #endif /* defined(__aarch64__) */ -void decode_regs(CpuIsaInfo &isa, const uint64_t isar0, const uint64_t isar1, const uint64_t pfr0, const uint64_t pfr1, const uint64_t svefr0) +void decode_regs(CpuIsaInfo &isa, + const uint64_t isar0, + const uint64_t isar1, + const uint64_t pfr0, + const uint64_t pfr1, + const uint64_t svefr0) { auto is_supported = [](uint64_t feature_reg, uint8_t feature_pos) -> bool - { - return ((feature_reg >> feature_pos) & 0xf); - }; + { return ((feature_reg >> feature_pos) & 0xf); }; // High-level SIMD support isa.sve = is_supported(pfr0, 32); @@ -124,11 +128,11 @@ void decode_regs(CpuIsaInfo &isa, const uint64_t isar0, const uint64_t isar1, co */ void allowlisted_model_features(CpuIsaInfo &isa, CpuModel model) { - if(isa.dot == false) + if (isa.dot == false) { isa.dot = model_supports_dot(model); } - if(isa.fp16 == false) + if (isa.fp16 == false) { isa.fp16 = model_supports_fp16(model); } @@ -147,7 +151,8 @@ CpuIsaInfo init_cpu_isa_from_hwcaps(uint32_t hwcaps, uint32_t hwcaps2, uint32_t return isa; } -CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr) +CpuIsaInfo +init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr) { CpuIsaInfo isa; diff --git a/src/common/cpuinfo/CpuIsaInfo.h 
b/src/common/cpuinfo/CpuIsaInfo.h index b92b6538b6..9d6bc07b67 100644 --- a/src/common/cpuinfo/CpuIsaInfo.h +++ b/src/common/cpuinfo/CpuIsaInfo.h @@ -37,22 +37,22 @@ namespace cpuinfo struct CpuIsaInfo { /* SIMD extension support */ - bool neon{ false }; - bool sve{ false }; - bool sve2{ false }; - bool sme{ false }; - bool sme2{ false }; + bool neon{false}; + bool sve{false}; + bool sve2{false}; + bool sme{false}; + bool sme2{false}; /* Data-type extensions support */ - bool fp16{ false }; - bool bf16{ false }; - bool svebf16{ false }; + bool fp16{false}; + bool bf16{false}; + bool svebf16{false}; /* Instruction support */ - bool dot{ false }; - bool i8mm{ false }; - bool svei8mm{ false }; - bool svef32mm{ false }; + bool dot{false}; + bool i8mm{false}; + bool svei8mm{false}; + bool svef32mm{false}; }; /** Identify ISA related information through system information @@ -76,7 +76,8 @@ CpuIsaInfo init_cpu_isa_from_hwcaps(uint32_t hwcaps, uint32_t hwcaps2, uint32_t * * @return CpuIsaInfo A populated ISA feature structure */ -CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr); +CpuIsaInfo +init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr); } // namespace cpuinfo } // namespace arm_compute diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp index d6d91df133..0455670302 100644 --- a/src/common/cpuinfo/CpuModel.cpp +++ b/src/common/cpuinfo/CpuModel.cpp @@ -29,12 +29,12 @@ namespace cpuinfo { std::string cpu_model_to_string(CpuModel model) { - switch(model) + switch (model) { #define X(MODEL) \ -case CpuModel::MODEL: \ - return #MODEL; - ARM_COMPUTE_CPU_MODEL_LIST + case CpuModel::MODEL: \ + return #MODEL; + ARM_COMPUTE_CPU_MODEL_LIST #undef X default: { @@ -45,7 +45,7 @@ case CpuModel::MODEL: \ bool model_supports_fp16(CpuModel model) { - switch(model) + switch (model) { case CpuModel::GENERIC_FP16: case CpuModel::GENERIC_FP16_DOT: @@ -63,7 +63,7 @@ bool model_supports_fp16(CpuModel model) bool model_supports_dot(CpuModel model) { - switch(model) + switch (model) { case CpuModel::GENERIC_FP16_DOT: case CpuModel::A55r1: @@ -87,16 +87,16 @@ CpuModel midr_to_model(uint32_t midr) const int cpunum = (midr >> 4) & 0xFFF; // Only CPUs we have code paths for are detected. 
All other CPUs can be safely classed as "GENERIC" - if(implementer == 0x41) // Arm CPUs + if (implementer == 0x41) // Arm CPUs { - switch(cpunum) + switch (cpunum) { case 0xd03: // A53 case 0xd04: // A35 model = CpuModel::A53; break; case 0xd05: // A55 - if(variant != 0) + if (variant != 0) { model = CpuModel::A55r1; } @@ -109,7 +109,7 @@ CpuModel midr_to_model(uint32_t midr) model = CpuModel::A73; break; case 0xd0a: // A75 - if(variant != 0) + if (variant != 0) { model = CpuModel::GENERIC_FP16_DOT; } @@ -144,9 +144,9 @@ CpuModel midr_to_model(uint32_t midr) break; } } - else if(implementer == 0x46) + else if (implementer == 0x46) { - switch(cpunum) + switch (cpunum) { case 0x001: // A64FX model = CpuModel::A64FX; @@ -156,9 +156,9 @@ CpuModel midr_to_model(uint32_t midr) break; } } - else if(implementer == 0x48) + else if (implementer == 0x48) { - switch(cpunum) + switch (cpunum) { case 0xd40: // A76 model = CpuModel::GENERIC_FP16_DOT; @@ -168,9 +168,9 @@ CpuModel midr_to_model(uint32_t midr) break; } } - else if(implementer == 0x51) + else if (implementer == 0x51) { - switch(cpunum) + switch (cpunum) { case 0x800: // A73 model = CpuModel::A73; @@ -196,4 +196,4 @@ CpuModel midr_to_model(uint32_t midr) return model; } } // namespace cpuinfo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/common/cpuinfo/CpuModel.h b/src/common/cpuinfo/CpuModel.h index 4fe6c29e53..3b9d9e3494 100644 --- a/src/common/cpuinfo/CpuModel.h +++ b/src/common/cpuinfo/CpuModel.h @@ -24,11 +24,11 @@ #ifndef SRC_COMMON_CPUINFO_CPUMODEL_H #define SRC_COMMON_CPUINFO_CPUMODEL_H +#include "arm_compute/core/CPP/CPPTypes.h" + #include #include -#include "arm_compute/core/CPP/CPPTypes.h" - namespace arm_compute { namespace cpuinfo diff --git a/src/common/utils/LegacySupport.cpp b/src/common/utils/LegacySupport.cpp index 06b1693bd1..102644227e 100644 --- a/src/common/utils/LegacySupport.cpp +++ b/src/common/utils/LegacySupport.cpp @@ -33,7 +33,7 @@ namespace { DataType convert_to_legacy_data_type(AclDataType data_type) { - switch(data_type) + switch (data_type) { case AclDataType::AclFloat32: return DataType::F32; @@ -48,7 +48,7 @@ DataType convert_to_legacy_data_type(AclDataType data_type) AclDataType convert_to_c_data_type(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return AclDataType::AclFloat32; @@ -64,7 +64,7 @@ AclDataType convert_to_c_data_type(DataType data_type) TensorShape create_legacy_tensor_shape(int32_t ndims, int32_t *shape) { TensorShape legacy_shape{}; - for(int32_t d = 0; d < ndims; ++d) + for (int32_t d = 0; d < ndims; ++d) { legacy_shape.set(d, shape[d], false); } @@ -73,14 +73,14 @@ TensorShape create_legacy_tensor_shape(int32_t ndims, int32_t *shape) int32_t *create_tensor_shape_array(const TensorInfo &info) { const auto num_dims = info.num_dimensions(); - if(num_dims <= 0) + if (num_dims <= 0) { return nullptr; } int32_t *shape_array = new int32_t[num_dims]; - for(size_t d = 0; d < num_dims; ++d) + for (size_t d = 0; d < num_dims; ++d) { shape_array[d] = info.tensor_shape()[d]; } @@ -92,28 +92,23 @@ int32_t *create_tensor_shape_array(const TensorInfo &info) TensorInfo convert_to_legacy_tensor_info(const AclTensorDescriptor &desc) { TensorInfo legacy_desc; - legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, convert_to_legacy_data_type(desc.data_type)); + legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, + convert_to_legacy_data_type(desc.data_type)); return legacy_desc; } 
AclTensorDescriptor convert_to_descriptor(const TensorInfo &info) { const auto num_dims = info.num_dimensions(); - AclTensorDescriptor desc - { - static_cast(num_dims), - create_tensor_shape_array(info), - convert_to_c_data_type(info.data_type()), - nullptr, - 0 - }; + AclTensorDescriptor desc{static_cast(num_dims), create_tensor_shape_array(info), + convert_to_c_data_type(info.data_type()), nullptr, 0}; return desc; } ActivationLayerInfo convert_to_activation_info(const AclActivationDescriptor &desc) { ActivationLayerInfo::ActivationFunction act; - switch(desc.type) + switch (desc.type) { case AclActivationType::AclIdentity: act = ActivationLayerInfo::ActivationFunction::IDENTITY; diff --git a/src/common/utils/Log.h b/src/common/utils/Log.h index bbfe1ce1b3..6ebfed366e 100644 --- a/src/common/utils/Log.h +++ b/src/common/utils/Log.h @@ -38,20 +38,22 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/logging/Macros.h" + #include "utils/TypePrinter.h" /** Create a logger * * @note It will eventually create all default loggers in don't exist */ -#define ARM_COMPUTE_CREATE_ACL_LOGGER() \ - do \ - { \ - if(arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr) \ - { \ - arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary", arm_compute::logging::LogLevel::INFO); \ - } \ - } while(false) +#define ARM_COMPUTE_CREATE_ACL_LOGGER() \ + do \ + { \ + if (arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr) \ + { \ + arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary", \ + arm_compute::logging::LogLevel::INFO); \ + } \ + } while (false) /** Log a message to the logger * @@ -63,7 +65,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG("ComputeLibrary", log_level, msg); \ - } while(false) + } while (false) /** Log a message with format to the logger * @@ -76,7 +78,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FORMAT("ComputeLibrary", log_level, fmt, __VA_ARGS__); \ - } while(false) + } while (false) /** Log an error message to the logger * @@ -87,7 +89,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \ - } while(false) + } while (false) /** Log an error message to the logger with function name before the message * @@ -98,7 +100,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \ - } while(false) + } while (false) /** Log an information message to the logger with function name before the message * @@ -109,7 +111,7 @@ { \ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::INFO, msg); \ - } while(false) + } while (false) /** Function template specialization for the out of bound element at index = tuple_size * @@ -131,12 +133,13 @@ logParamsImpl(std::vector &data_registry, const std::tuple & * @param[in] in_params_tuple Constant reference to a tuple of different input data types */ template -inline typename std::enable_if < Index::type -logParamsImpl(std::vector &data_registry, const std::tuple &in_params_tuple) + inline typename std::enable_if < + Index::type logParamsImpl(std::vector &data_registry, + const std::tuple &in_params_tuple) { data_registry.push_back(arm_compute::to_string(std::get(in_params_tuple))); // Unfold the next tuple element - logParamsImpl < Index + 1, Tp... 
> (data_registry, in_params_tuple); + logParamsImpl(data_registry, in_params_tuple); } /** Function Template with variable number of inputs to collect all the passed parameters from @@ -149,10 +152,10 @@ logParamsImpl(std::vector &data_registry, const std::tuple & * @return Vector of the parameters' data in a string format */ template -const std::vector logParams(Ts &&... ins) +const std::vector logParams(Ts &&...ins) { std::vector data_registry{}; - std::tuple in_params_tuple{ ins... }; + std::tuple in_params_tuple{ins...}; // Start logging the tuple elements, starting from 0 to tuple_size-1 logParamsImpl<0>(data_registry, in_params_tuple); @@ -178,11 +181,11 @@ inline const std::vector getParamsNames(const std::string &in_param // Usually the input parameters string would be name of parameters separated // by ',' e.g. "src0, src1, policy" - while(std::getline(ss, temp, ',')) + while (std::getline(ss, temp, ',')) { names.push_back(temp); } - for(auto &name : names) + for (auto &name : names) { // Totally get rid of white space characters name.erase(std::remove(name.begin(), name.end(), ' '), name.end()); @@ -205,7 +208,7 @@ inline const std::string constructDataLog(const std::vector ¶ms { std::string dataLog = "\n "; ARM_COMPUTE_ERROR_ON(params_names.size() != data_registry.size()); - for(uint8_t i = 0; i < params_names.size(); ++i) + for (uint8_t i = 0; i < params_names.size(); ++i) { dataLog += params_names[i] + ": " + data_registry.at(i) + "\n "; } @@ -220,11 +223,11 @@ inline const std::string constructDataLog(const std::vector ¶ms * * @param[in] ... Input parameters */ -#define ARM_COMPUTE_LOG_PARAMS(...) \ - do \ - { \ - ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(constructDataLog(getParamsNames(#__VA_ARGS__), \ - logParams(__VA_ARGS__))); \ - } while(false) +#define ARM_COMPUTE_LOG_PARAMS(...) 
\ + do \ + { \ + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL( \ + constructDataLog(getParamsNames(#__VA_ARGS__), logParams(__VA_ARGS__))); \ + } while (false) #endif /* ARM_COMPUTE_LOGGING_ENABLED */ #endif /* SRC_COMMON_LOG_H */ diff --git a/src/common/utils/Macros.h b/src/common/utils/Macros.h index 2e44ea599e..35f7e759d3 100644 --- a/src/common/utils/Macros.h +++ b/src/common/utils/Macros.h @@ -28,7 +28,7 @@ #define ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status) \ { \ - if(status != arm_compute::StatusCode::Success) \ + if (status != arm_compute::StatusCode::Success) \ { \ return arm_compute::utils::as_cenum(status); \ } \ diff --git a/src/common/utils/Object.h b/src/common/utils/Object.h index 1f194737d4..b73de8e430 100644 --- a/src/common/utils/Object.h +++ b/src/common/utils/Object.h @@ -52,14 +52,12 @@ struct Header * @param[in] type_ Object identification type * @param[in] ctx_ Context to reference */ - Header(ObjectType type_, IContext *ctx_) noexcept - : type(type_), - ctx(ctx_) + Header(ObjectType type_, IContext *ctx_) noexcept : type(type_), ctx(ctx_) { } - ObjectType type{ ObjectType::Invalid }; - IContext *ctx{ nullptr }; + ObjectType type{ObjectType::Invalid}; + IContext *ctx{nullptr}; }; } // namespace detail } // namespace arm_compute diff --git a/src/common/utils/Utils.h b/src/common/utils/Utils.h index 1bd1c7ec57..33fe6c0e81 100644 --- a/src/common/utils/Utils.h +++ b/src/common/utils/Utils.h @@ -74,10 +74,7 @@ constexpr SE as_enum(const E val) noexcept template bool is_in(E check, std::initializer_list list) { - return std::any_of(list.begin(), list.end(), [&check](E e) - { - return check == e; - }); + return std::any_of(list.begin(), list.end(), [&check](E e) { return check == e; }); } } // namespace utils } // namespace arm_compute diff --git a/src/common/utils/Validate.h b/src/common/utils/Validate.h index 4e8807273a..97819c619f 100644 --- a/src/common/utils/Validate.h +++ b/src/common/utils/Validate.h @@ -29,7 +29,7 @@ #include -#define ARM_COMPUTE_ASSERT(cond) assert(cond) +#define ARM_COMPUTE_ASSERT(cond) assert(cond) #define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr) assert((ptr) != nullptr) #else /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */ diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp index ca2f7d238f..52be6990ab 100644 --- a/src/core/AccessWindowAutoPadding.cpp +++ b/src/core/AccessWindowAutoPadding.cpp @@ -28,12 +28,14 @@ using namespace arm_compute; -AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) - : _info(info) +AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) : _info(info) { } -ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { ARM_COMPUTE_UNUSED(window); ARM_COMPUTE_UNUSED(input_valid_region); @@ -45,17 +47,17 @@ ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion AccessWindowAutoPadding::compute_valid_region() const { - if(_info == nullptr) + if (_info == nullptr) { return ValidRegion{}; } - return ValidRegion{ Coordinates(), _info->tensor_shape() }; + return ValidRegion{Coordinates(), _info->tensor_shape()}; } void AccessWindowAutoPadding::set_valid_region() { - if(_info == nullptr) + if (_info == nullptr) { return; } @@ -75,7 +77,7 @@ bool 
AccessWindowAutoPadding::update_padding_if_needed(const Window &window) ARM_COMPUTE_UNUSED(window); // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h index b8d1508679..406bdba0d8 100644 --- a/src/core/AccessWindowAutoPadding.h +++ b/src/core/AccessWindowAutoPadding.h @@ -74,9 +74,12 @@ public: ValidRegion compute_valid_region() const; // Inherited methods overridden: - bool update_window_if_needed(Window &window) const override; - bool update_padding_if_needed(const Window &window) override; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + bool update_window_if_needed(Window &window) const override; + bool update_padding_if_needed(const Window &window) override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; private: ITensorInfo *_info; diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp index 0607011bc5..98182b1202 100644 --- a/src/core/AccessWindowStatic.cpp +++ b/src/core/AccessWindowStatic.cpp @@ -34,7 +34,10 @@ AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start { } -ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { ARM_COMPUTE_UNUSED(border_undefined); ARM_COMPUTE_UNUSED(border_size); @@ -44,7 +47,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -57,7 +60,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid // Start of the valid region is equal to the start of the static access but // never outside of the tensor. anchor.set(0, std::max(0, _start_x)); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { anchor.set(1, std::max(0, _start_y)); } @@ -65,7 +68,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid // End of the valid region is equal to the end of the static access but // never outside of the tensor. 
shape.set(0, std::min(_end_x, _info->tensor_shape()[0])); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { shape.set(1, std::min(_end_y, _info->tensor_shape()[1])); } @@ -75,7 +78,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region) { - if(_info != nullptr) + if (_info != nullptr) { _info->set_valid_region(compute_valid_region(window, input_valid_region)); } @@ -84,7 +87,7 @@ void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegio bool AccessWindowStatic::update_window_if_needed(Window &window) const { // If the padding is not enough and the tensor is not resizable, shrink the window to size 0 - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } @@ -96,48 +99,50 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const bool window_modified = false; // Calculate if padding is enough - if(_start_y < 0) + if (_start_y < 0) { const int front_pad_y_available = -static_cast(offset_first_element / strides[1]); - if(_start_y < front_pad_y_available) + if (_start_y < front_pad_y_available) { window_modified = true; } } - if(!window_modified) + if (!window_modified) { - if(_end_y > static_cast(shape[1])) + if (_end_y > static_cast(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size(); const int tail_pad_y_available = (stride_z / strides[1]) - shape[1]; - if(static_cast(shape[1]) + tail_pad_y_available < _end_y) + if (static_cast(shape[1]) + tail_pad_y_available < _end_y) { window_modified = true; } } - if(!window_modified) + if (!window_modified) { const int stride_y = _info->num_dimensions() > 1 ? 
strides[1] : _info->total_size(); - if(_start_x < 0) + if (_start_x < 0) { - const int front_pad_x_available = -std::min(static_cast(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast(strides[0]); + const int front_pad_x_available = + -std::min(static_cast(offset_first_element), stride_y - shape[0] * strides[0]) / + static_cast(strides[0]); - if(_start_x < front_pad_x_available) + if (_start_x < front_pad_x_available) { window_modified = true; } } - if(!window_modified && _end_x > static_cast(shape[0])) + if (!window_modified && _end_x > static_cast(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0]; - if(static_cast(shape[0]) + tail_pad_x_available < _end_x) + if (static_cast(shape[0]) + tail_pad_x_available < _end_x) { window_modified = true; } @@ -146,9 +151,9 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const } // If padding is not enough - if(window_modified) + if (window_modified) { - for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i) { window.set(i, Window::Dimension(0, 0, 1)); } @@ -162,7 +167,7 @@ bool AccessWindowStatic::update_padding_if_needed(const Window &window) ARM_COMPUTE_UNUSED(window); // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h index f7d43cbb55..5c6d2c7db0 100644 --- a/src/core/AccessWindowStatic.h +++ b/src/core/AccessWindowStatic.h @@ -86,9 +86,12 @@ public: ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const; // Inherited methods overriden: - bool update_window_if_needed(Window &window) const override; - bool update_padding_if_needed(const Window &window) override; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + bool update_window_if_needed(Window &window) const override; + bool update_padding_if_needed(const Window &window) override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; private: ITensorInfo *_info; diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp index d8bd4c4de1..42f0081c14 100644 --- a/src/core/AccessWindowTranspose.cpp +++ b/src/core/AccessWindowTranspose.cpp @@ -29,9 +29,12 @@ using namespace arm_compute; -ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -41,7 +44,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va Coordinates old_anchor(anchor); TensorShape old_shape(shape); - if(!border_undefined) + if (!border_undefined) { border_size = BorderSize(0); } @@ -53,7 +56,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va // the kernel to write back output values. // As the relation between input and output is transposed window.y() is // used for x anchor and window.x() for y anchor. 
- if(_info->dimension(0) > 1) + if (_info->dimension(0) > 1) { anchor.set(0, std::max(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x); } @@ -69,15 +72,19 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va // a size of the region. // As the relation between input and output is transposed window.y() is // used for x shape and window.x() for y shape. - if(_info->dimension(0) > 1) + if (_info->dimension(0) > 1) { - shape.set(0, std::min((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]); + shape.set(0, std::min((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, + (window.y().end() - window.y().step()) * _scale_x + _width) - + anchor[0]); } - shape.set(1, std::min((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]); + shape.set(1, std::min((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, + (window.x().end() - window.x().step()) * _scale_y + _height) - + anchor[1]); // For higher dimensions use the intersection of the window size and the // valid region of the input - for(size_t d = 2; d < _info->num_dimensions(); ++d) + for (size_t d = 2; d < _info->num_dimensions(); ++d) { anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d])); shape.set(d, std::min(window[d].end(), input_valid_region.shape[d]) - anchor[d]); @@ -89,7 +96,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va bool AccessWindowTranspose::update_window_if_needed(Window &window) const { // Only update the window size if we can't use padding - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } @@ -107,12 +114,12 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const const int max_y = window.x().end() * _scale_y + _y; // Adjust window start for output's Y dimension (so X in (input) window) - if(min_y < 0) + if (min_y < 0) { // Calculate rows available above the tensor const int front_pad_y_available = -offset_first_element / strides[1]; - if(min_y < front_pad_y_available) + if (min_y < front_pad_y_available) { // Not enough padding available, need to shrink the window const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y; @@ -126,17 +133,18 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const } // Adjust window end for Y dimension - if(max_y > static_cast(shape[1])) + if (max_y > static_cast(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? 
strides[2] : _info->total_size(); // Calculate rows available below the tensor const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y; - if(static_cast(shape[1]) + tail_pad_y_available < max_y) + if (static_cast(shape[1]) + tail_pad_y_available < max_y) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height; + const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + + window.x().step() * _scale_y - _y - _height; window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step())); window_modified = true; } @@ -151,11 +159,14 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size(); // Adjust window start for X dimension - if(min_x < 0) + if (min_x < 0) { - const int front_pad_x_available = -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast(strides[0]); + const int front_pad_x_available = + -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], + stride_y - shape[0] * strides[0]) / + static_cast(strides[0]); - if(min_x < front_pad_x_available) + if (min_x < front_pad_x_available) { // Not enough padding available, need to shrink the window const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x; @@ -168,14 +179,15 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const } // Adjust window end for X dimension - if(max_x > static_cast(shape[0])) + if (max_x > static_cast(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x; - if(static_cast(shape[0]) + tail_pad_x_available < max_x) + if (static_cast(shape[0]) + tail_pad_x_available < max_x) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width; + const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + + window.y().step() * _scale_x - _x - _width; window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step())); window_modified = true; } @@ -189,7 +201,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const bool AccessWindowTranspose::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h index 0306076d6e..12bb9a535b 100644 --- a/src/core/AccessWindowTranspose.h +++ b/src/core/AccessWindowTranspose.h @@ -42,7 +42,10 @@ public: bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) override; using AccessWindowRectangle::compute_valid_region; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; }; } // namespace arm_compute #endif 
/*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/ diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp index 7fcfdf2c89..d094dcdaea 100644 --- a/src/core/CL/CLCommandBuffer.cpp +++ b/src/core/CL/CLCommandBuffer.cpp @@ -38,7 +38,7 @@ std::unique_ptr CLCommandBuffer::create(cl_command_queue queue) const auto &cl_device = CLKernelLibrary::get().get_device(); const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device); - if(has_mutable_dispatch) + if (has_mutable_dispatch) { return std::make_unique(queue); } diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h index 8a94e389fa..90e434161e 100644 --- a/src/core/CL/CLCommandBuffer.h +++ b/src/core/CL/CLCommandBuffer.h @@ -87,7 +87,8 @@ public: * @param[in] global The global work size. * @param[in] local The local work size. */ - virtual void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; + virtual void + add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; /** Add the mutable argument to the current kernel enqueue command. * @@ -154,7 +155,7 @@ protected: CLCommandBuffer &state(State state); private: - State _state{ State::Created }; + State _state{State::Created}; }; } // namespace arm_compute diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp index f1a902c7b9..242fd7719c 100644 --- a/src/core/CL/CLCompatCommandBuffer.cpp +++ b/src/core/CL/CLCompatCommandBuffer.cpp @@ -31,8 +31,7 @@ namespace arm_compute { -CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) - : _queue(queue) +CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue) { } @@ -40,11 +39,14 @@ CLCompatCommandBuffer::~CLCompatCommandBuffer() { } -void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) +void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) { ARM_COMPUTE_ERROR_ON(state() != State::Created); - _kernel_cmds.push_back(KernelCommand{ kernel, offset, global, local, {} }); + _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}}); } void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) @@ -52,7 +54,7 @@ void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const ARM_COMPUTE_ERROR_ON(state() != State::Created); ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty()); - _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{ arg_idx, size, value }); + _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value}); } void CLCompatCommandBuffer::finalize() @@ -61,7 +63,7 @@ void CLCompatCommandBuffer::finalize() _kernel_cmds.shrink_to_fit(); - for(auto &cmd : _kernel_cmds) + for (auto &cmd : _kernel_cmds) { cmd.mutable_args.shrink_to_fit(); } @@ -80,25 +82,19 @@ void CLCompatCommandBuffer::enqueue() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - for(const auto &cmd : _kernel_cmds) + for (const auto &cmd : _kernel_cmds) { - for(const auto &arg : cmd.mutable_args) + for (const auto &arg : cmd.mutable_args) { const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value); handle_cl_error("clSetKernelArg", error); } - const auto error = clEnqueueNDRangeKernel( - _queue, - cmd.kernel, 
- static_cast(cmd.global.dimensions()), - cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, - cmd.global.get(), - cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, - 0, - nullptr, - nullptr); + const auto error = + clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast(cmd.global.dimensions()), + cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(), + cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr); handle_cl_error("clEnqueueNDRangeKernel", error); } diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h index e91d52d2d6..d5df106425 100644 --- a/src/core/CL/CLCompatCommandBuffer.h +++ b/src/core/CL/CLCompatCommandBuffer.h @@ -57,7 +57,10 @@ public: /** Disallow move assignment. */ CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete; - void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override; + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; void finalize() override; diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index 2d024f9c2f..9bbc32657e 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -22,19 +22,19 @@ * SOFTWARE. */ #include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" + #include "support/StringSupport.h" #include namespace arm_compute { -CLBuildOptions::CLBuildOptions() - : _build_opts() +CLBuildOptions::CLBuildOptions() : _build_opts() { } @@ -45,7 +45,7 @@ void CLBuildOptions::add_option(std::string option) void CLBuildOptions::add_option_if(bool cond, std::string option) { - if(cond) + if (cond) { add_option(std::move(option)); } @@ -63,7 +63,7 @@ void CLBuildOptions::add_options(const StringSet &options) void CLBuildOptions::add_options_if(bool cond, const StringSet &options) { - if(cond) + if (cond) { add_options(options); } @@ -79,26 +79,35 @@ bool CLBuildOptions::operator==(const CLBuildOptions &other) const return _build_opts == other._build_opts; } -Program::Program() - : _context(), _device(), _is_binary(false), _name(), _source(), _binary() +Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary() { } Program::Program(cl::Context context, std::string name, std::string source) - : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary() + : _context(std::move(context)), + _device(), + _is_binary(false), + _name(std::move(name)), + _source(std::move(source)), + _binary() { } Program::Program(cl::Context context, cl::Device device, std::string name, std::vector binary) - : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary)) + : _context(std::move(context)), + _device(std::move(device)), + _is_binary(true), + _name(std::move(name)), + _source(), + _binary(std::move(binary)) { } Program::operator cl::Program() const { - if(_is_binary) + if (_is_binary) { - return cl::Program(_context, { _device }, { _binary }); + return cl::Program(_context, {_device}, {_binary}); } else { @@ -112,12 +121,12 @@ bool Program::build(const cl::Program &program, const std::string &build_options 
{ return program.build(build_options.c_str()) == CL_SUCCESS; } - catch(const cl::Error &e) + catch (const cl::Error &e) { cl_int err = CL_SUCCESS; const auto build_info = program.getBuildInfo(&err); - for(auto &pair : build_info) + for (auto &pair : build_info) { std::cerr << pair.second << std::endl; } @@ -133,14 +142,12 @@ cl::Program Program::build(const std::string &build_options) const return cl_program; } -Kernel::Kernel() - : _name(), _kernel() +Kernel::Kernel() : _name(), _kernel() { } Kernel::Kernel(std::string name, const cl::Program &program) - : _name(std::move(name)), - _kernel(cl::Kernel(program, _name.c_str())) + : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str())) { } CLCompileContext::CLCompileContext() @@ -156,15 +163,19 @@ CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device _is_wbsm_supported = get_wbsm_support_info(device); } -Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, - const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const +Kernel CLCompileContext::create_kernel(const std::string &kernel_name, + const std::string &program_name, + const std::string &program_source, + const std::string &kernel_path, + const StringSet &build_options_set, + bool is_binary) const { const std::string build_options = generate_build_options(build_options_set, kernel_path); const std::string built_program_name = program_name + "_" + build_options; auto built_program_it = _built_programs_map.find(built_program_name); cl::Program cl_program; - if(_built_programs_map.end() != built_program_it) + if (_built_programs_map.end() != built_program_it) { // If program has been built, retrieve to create kernel from it cl_program = built_program_it->second; @@ -184,11 +195,12 @@ Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std return Kernel(kernel_name, cl_program); } -const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const +const Program & +CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const { const auto program_it = _programs_map.find(program_name); - if(program_it != _programs_map.end()) + if (program_it != _programs_map.end()) { return program_it->second; } @@ -199,9 +211,10 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c ARM_COMPUTE_UNUSED(is_binary); program = Program(_context, program_name, program_source); #else /* EMBEDDED_KERNELS */ - if(is_binary) + if (is_binary) { - program = Program(_context, _device.cl_device(), program_name, std::vector(program_source.begin(), program_source.end())); + program = Program(_context, _device.cl_device(), program_name, + std::vector(program_source.begin(), program_source.end())); } else { @@ -218,18 +231,19 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c void CLCompileContext::set_context(cl::Context context) { _context = std::move(context); - if(_context.get() != nullptr) + if (_context.get() != nullptr) { const auto cl_devices = _context.getInfo(); - if(!cl_devices.empty()) + if (!cl_devices.empty()) { _device = CLDevice(cl_devices[0]); } } } -std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const +std::string CLCompileContext::generate_build_options(const StringSet 
&build_options_set, + const std::string &kernel_path) const { std::string concat_str; bool ext_supported = false; @@ -241,27 +255,27 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) GPUTarget gpu_arch = get_arch_from_target(_device.target()); - concat_str += " -DGPU_ARCH=" + support::cpp11::to_string( - static_cast::type>(gpu_arch)); + concat_str += + " -DGPU_ARCH=" + support::cpp11::to_string(static_cast::type>(gpu_arch)); - if(_device.supported("cl_khr_fp16")) + if (_device.supported("cl_khr_fp16")) { concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) + if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_accumulate_int8")) + if (_device.supported("cl_arm_integer_dot_product_accumulate_int8")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 "; } std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported(); - if(ext_supported) + if (ext_supported) { concat_str += ext_buildopts; } @@ -270,7 +284,7 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); } - if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) + if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) { concat_str += " -DUNROLL_WITH_PRAGMA "; } @@ -295,7 +309,7 @@ std::string CLCompileContext::stringify_set(const StringSet &s, const std::strin #endif /* EMBEDDED_KERNELS */ // Concatenate set - for(const auto &el : s) + for (const auto &el : s) { concat_set += " " + el; } @@ -340,7 +354,7 @@ cl::NDRange CLCompileContext::default_ndrange() const GPUTarget _target = get_target_from_device(_device.cl_device()); cl::NDRange default_range; - switch(_target) + switch (_target) { case GPUTarget::MIDGARD: case GPUTarget::T600: @@ -370,7 +384,8 @@ size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) cons size_t result; size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result); - ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; @@ -392,7 +407,7 @@ int32_t CLCompileContext::get_ddk_version() const const std::regex ddk_regex("r([0-9]*)p[0-9]"); std::smatch ddk_match; - if(std::regex_search(device_version, ddk_match, ddk_regex)) + if (std::regex_search(device_version, ddk_match, ddk_regex)) { return std::stoi(ddk_match[1]); } diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 78f36100d5..5ea99d360a 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -22,14 +22,15 @@ * SOFTWARE. 
*/ #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Types.h" -#include "src/gpu/cl/ClCompileContext.h" +#include "arm_compute/core/utils/DataTypeUtils.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClKernelLibrary.h" #include @@ -39,7 +40,7 @@ namespace arm_compute { std::string get_cl_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -75,7 +76,7 @@ std::string get_cl_type_from_data_type(const DataType &dt) std::string get_cl_promoted_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -105,7 +106,7 @@ std::string get_cl_promoted_type_from_data_type(const DataType &dt) std::string get_cl_unsigned_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "uchar"; @@ -123,7 +124,7 @@ std::string get_cl_unsigned_type_from_element_size(size_t element_size) std::string get_cl_signed_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "char"; @@ -141,7 +142,7 @@ std::string get_cl_signed_type_from_element_size(size_t element_size) std::string get_cl_select_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -174,7 +175,7 @@ std::string get_cl_select_type_from_data_type(const DataType &dt) std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -192,7 +193,7 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) std::string get_data_size_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::S8: @@ -244,8 +245,9 @@ bool dot8_supported(const cl::Device &device) const GPUTarget gpu_target = get_target_from_name(device_name); // SW_WORKAROUND: Workaround for DDK revision r14p0.to enable cl_arm_integer_dot_product_int8 - std::set sw_workaround_issue = { GPUTarget::G76 }; - return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0); + std::set sw_workaround_issue = {GPUTarget::G76}; + return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || + sw_workaround_issue.count(gpu_target) != 0); } bool dot8_acc_supported(const cl::Device &device) @@ -256,23 +258,23 @@ bool dot8_acc_supported(const cl::Device &device) CLVersion get_cl_version(const cl::Device &device) { std::string version_str = device.getInfo(); - if(version_str.find("OpenCL 3") != std::string::npos) + if (version_str.find("OpenCL 3") != std::string::npos) { return CLVersion::CL30; } - else if(version_str.find("OpenCL 2") != std::string::npos) + else if (version_str.find("OpenCL 2") != std::string::npos) { return CLVersion::CL20; } - else if(version_str.find("OpenCL 1.2") != std::string::npos) + else if (version_str.find("OpenCL 1.2") != std::string::npos) { return CLVersion::CL12; } - else if(version_str.find("OpenCL 1.1") != std::string::npos) + else if (version_str.find("OpenCL 1.1") != std::string::npos) { return CLVersion::CL11; } - else if(version_str.find("OpenCL 1.0") != std::string::npos) + else if (version_str.find("OpenCL 1.0") != std::string::npos) 
{ return CLVersion::CL10; } @@ -287,14 +289,15 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n return (pos != std::string::npos); } -bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout) +bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, + const Size2D &kernel_size, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN); using WinogradConfiguration = std::pair, std::pair>; - std::vector winograd_configs_nchw = - { + std::vector winograd_configs_nchw = { WinogradConfiguration(std::pair(1, 2), std::pair(1, 3)), WinogradConfiguration(std::pair(1, 4), std::pair(1, 3)), WinogradConfiguration(std::pair(2, 1), std::pair(3, 1)), @@ -303,11 +306,9 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si WinogradConfiguration(std::pair(4, 4), std::pair(3, 3)), WinogradConfiguration(std::pair(4, 4), std::pair(5, 5)), WinogradConfiguration(std::pair(4, 1), std::pair(5, 1)), - WinogradConfiguration(std::pair(1, 4), std::pair(1, 5)) - }; + WinogradConfiguration(std::pair(1, 4), std::pair(1, 5))}; - std::vector winograd_configs_nhwc = - { + std::vector winograd_configs_nhwc = { WinogradConfiguration(std::pair(2, 2), std::pair(3, 3)), WinogradConfiguration(std::pair(1, 4), std::pair(1, 3)), WinogradConfiguration(std::pair(4, 1), std::pair(3, 1)), @@ -324,19 +325,21 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si std::pair(kernel_size.width, kernel_size.height)); // Return true if supported - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end()); + return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != + winograd_configs_nchw.end()); } else { - return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end()); + return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != + winograd_configs_nhwc.end()); } } size_t preferred_vector_width(const cl::Device &device, const DataType dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::S8: @@ -382,7 +385,7 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device) cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr); - if(err == CL_SUCCESS) + if (err == CL_SUCCESS) { return pixel_aligment; } @@ -396,12 +399,14 @@ bool get_cl_non_uniform_work_group_supported(const cl::Device &device) { cl_bool supported = CL_FALSE; - cl_int err = clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); + cl_int err = + clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); return (err == CL_SUCCESS && supported == CL_TRUE); } -cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set &build_opts) +cl::Kernel +create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set &build_opts) { opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); @@ -409,7 +414,8 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_ auto kernel_src = klib.program(program_name); const std::string kernel_path = klib.kernel_path(); - return 
static_cast(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary)); + return static_cast(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, + build_opts, kernel_src.is_binary)); } cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size) @@ -423,8 +429,9 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio bool get_wbsm_support_info(const cl::Device &device) { cl_bitfield capabilities = 0; - cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr); - if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) + cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), + &capabilities, nullptr); + if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) { return true; } @@ -433,35 +440,33 @@ bool get_wbsm_support_info(const cl::Device &device) void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint) { - cl_int err = clSetKernelExecInfo(kernel.get(), - CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, - sizeof(cl_int), - &wbsm_hint); + cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, + sizeof(cl_int), &wbsm_hint); ARM_COMPUTE_UNUSED(err); ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS); } bool export_to_cl_image(const ITensorInfo *tensor) { - if(tensor->tensor_shape()[0] % 4 != 0) + if (tensor->tensor_shape()[0] % 4 != 0) { return false; } // If not floating point - if(!is_data_type_float(tensor->data_type())) + if (!is_data_type_float(tensor->data_type())) { return false; } // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) { return false; } // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) { return false; } @@ -471,7 +476,7 @@ bool export_to_cl_image(const ITensorInfo *tensor) const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - if(image_w > max_image_w || image_h > max_image_h) + if (image_w > max_image_w || image_h > max_image_h) { return false; } @@ -481,9 +486,9 @@ bool export_to_cl_image(const ITensorInfo *tensor) void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list values) { - for(const int value : values) + for (const int value : values) { - if(value > max_manual_loop_unrolling) + if (value > max_manual_loop_unrolling) { built_opts.add_option("-DUNROLL_WITH_PRAGMA"); return; diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index c5a0796c3a..e69d006750 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -22,8 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "arm_compute/core/Error.h" + #include "src/gpu/cl/ClKernelLibrary.h" + #include #include #include @@ -31,8 +34,7 @@ #include namespace arm_compute { -CLKernelLibrary::CLKernelLibrary() - : _compile_context() +CLKernelLibrary::CLKernelLibrary() : _compile_context() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built } @@ -41,13 +43,15 @@ CLKernelLibrary &CLKernelLibrary::get() static CLKernelLibrary _kernel_library; return _kernel_library; } -Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set &build_options_set) const +Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, + const std::set &build_options_set) const { const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); const std::string program_name = klib.program_name(kernel_name); auto program = klib.program(program_name); const std::string &kernel_path = CLKernelLibrary::get().get_kernel_path(); - return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary); + return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, + program.is_binary); } std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const { @@ -131,4 +135,4 @@ CLCompileContext &CLKernelLibrary::get_compile_context() { return _compile_context; } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp index b9c59ac6f0..05b351fc25 100644 --- a/src/core/CL/CLMutableCommandBuffer.cpp +++ b/src/core/CL/CLMutableCommandBuffer.cpp @@ -31,8 +31,7 @@ namespace arm_compute { -CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) - : CLCommandBuffer() +CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer() { cl_int status = CL_SUCCESS; @@ -52,7 +51,10 @@ CLMutableCommandBuffer::~CLMutableCommandBuffer() handle_cl_error("clReleaseCommandBufferKHR", status); } -void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) +void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) { ARM_COMPUTE_ERROR_ON(state() != State::Created); @@ -65,18 +67,8 @@ void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &off }; const auto error = clCommandNDRangeKernelKHR( - _cb, - nullptr, - properties, - kernel, - global.dimensions(), - offset.dimensions() != 0 ? offset.get() : nullptr, - global.get(), - local.dimensions() != 0 ? local.get() : nullptr, - 0, - nullptr, - nullptr, - &mutable_handle); + _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr, + global.get(), local.dimensions() != 0 ? 
local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle); handle_cl_error("clCommandNDRangeKernelKHR", error); @@ -114,7 +106,7 @@ void CLMutableCommandBuffer::finalize() size_t arg_no = 0; - for(auto &mut_dispatch_cfg : _mut_dispatch_cfgs) + for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs) { ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size()); mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no]; @@ -132,9 +124,7 @@ void CLMutableCommandBuffer::update() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - const auto error = clUpdateMutableCommandsKHR( - _cb, - &_mut_cfg); + const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg); handle_cl_error("clUpdateMutableCommandsKHR", error); } @@ -143,13 +133,7 @@ void CLMutableCommandBuffer::enqueue() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - const auto error = clEnqueueCommandBufferKHR( - 0, - nullptr, - _cb, - 0, - nullptr, - nullptr); + const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr); handle_cl_error("clEnqueueCommandBufferKHR", error); } diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h index 04e94b0bb2..8997d7d1fd 100644 --- a/src/core/CL/CLMutableCommandBuffer.h +++ b/src/core/CL/CLMutableCommandBuffer.h @@ -57,7 +57,10 @@ public: /** Disallow move assignment. */ CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete; - void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override; + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; void finalize() override; diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp index 289300b3a1..290ed32648 100644 --- a/src/core/CL/CLUtils.cpp +++ b/src/core/CL/CLUtils.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "support/StringSupport.h" namespace arm_compute @@ -38,15 +39,15 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); const cl::Context &ctx = CLKernelLibrary::get().context(); - const cl::Buffer &buffer = tensor->cl_buffer(); + const cl::Buffer &buffer = tensor->cl_buffer(); const ITensorInfo *info = tensor->info(); - ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), - "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement"); + ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to " + "satisfy cl_image pitch alignment requirement"); - const size_t image_w{ info->dimension(0) / 4 }; - const size_t image_h{ info->tensor_shape().total_size() / info->dimension(0) }; - const size_t max_image_w{ CLKernelLibrary::get().get_device().getInfo() }; - const size_t max_image_h{ CLKernelLibrary::get().get_device().getInfo() }; + const size_t image_w{info->dimension(0) / 4}; + const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)}; + const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo()}; + const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo()}; ARM_COMPUTE_UNUSED(max_image_w, max_image_h); 
ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image"); @@ -58,18 +59,22 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type); } -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type) +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type) { ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); - ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, - "Cannot create cl_image from empty cl_buffer"); + ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from empty cl_buffer"); cl_channel_type cl_data_type; - switch(data_type) + switch (data_type) { case DataType::F32: cl_data_type = CL_FLOAT; @@ -84,7 +89,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer cl_mem cl_image; cl_int err = CL_SUCCESS; - const cl_image_format format = { CL_RGBA, cl_data_type }; + const cl_image_format format = {CL_RGBA, cl_data_type}; cl_image_desc desc; memset(&desc, 0, sizeof(desc)); @@ -94,7 +99,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer desc.image_width = shape2d[0]; desc.image_height = shape2d[1]; - switch(image_type) + switch (image_type) { case CLImage2DType::ReadOnly: cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err); @@ -114,7 +119,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer void handle_cl_error(const std::string &function_name, cl_int error_code) { - if(error_code != CL_SUCCESS) + if (error_code != CL_SUCCESS) { std::string error_message = function_name + " - Error code: " + std::to_string(error_code); ARM_COMPUTE_ERROR(error_message.c_str()); diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h index de9c1b3194..f9dcfeac3a 100644 --- a/src/core/CL/CLUtils.h +++ b/src/core/CL/CLUtils.h @@ -72,7 +72,12 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im * * @return cl::Image2D object */ -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type); +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type); /** Check for CL error code and throw exception accordingly. 
* diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h index 7b5294e452..50d224f1c0 100644 --- a/src/core/CL/CLValidate.h +++ b/src/core/CL/CLValidate.h @@ -29,11 +29,13 @@ namespace arm_compute { -#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) -#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) /** Return an error if int64_base_atomics extension is not supported by the device. * @@ -43,11 +45,13 @@ namespace arm_compute * * @return Status */ -inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) +inline arm_compute::Status +error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) { - if(!CLKernelLibrary::get().int64_base_atomics_supported()) + if (!CLKernelLibrary::get().int64_base_atomics_supported()) { - return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported"); + return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, + "Atomic functions are not supported"); } return arm_compute::Status{}; } diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp index a53fdbbab6..f96b24d2a9 100644 --- a/src/core/CL/DefaultLWSHeuristics.cpp +++ b/src/core/CL/DefaultLWSHeuristics.cpp @@ -31,13 +31,13 @@ cl::NDRange get_gemm_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_y); - if(gws_z != 1) + if (gws_z != 1) { return cl::NDRange(4, 4, 2); } else { - if(gws_x > 256) + if (gws_x > 256) { return cl::NDRange(2, 16, 1); } @@ -59,9 +59,9 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < gws_y) + if (gws_x < gws_y) { - if(gws_x < 4) + if (gws_x < 4) { return cl::NDRange(std::min(gws_x, static_cast(2u)), 32, 1); } @@ -81,7 +81,7 @@ cl::NDRange get_dwc_lws(size_t gws_x, size_t gws_y, size_t gws_z) ARM_COMPUTE_UNUSED(gws_y); ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < 32) + if (gws_x < 32) { return cl::NDRange(gws_x, 4, 4); } @@ -100,7 +100,7 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) const size_t gws_y = gws[1]; const size_t gws_z = gws[2]; - switch(kernel_type) + switch (kernel_type) { case CLKernelType::GEMM: { @@ -124,4 +124,4 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) } } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index dc3a86a528..ac53e7f1d2 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -25,18 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + 
#include "src/core/helpers/Utils.h" #include -void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items) +void arm_compute::enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint, + bool use_dummy_work_items) { - if(kernel.kernel()() == nullptr) + if (kernel.kernel()() == nullptr) { return; } - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_ERROR_ON(window[i].step() == 0); // Make sure that dimensions > Z are 1 @@ -46,7 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items); // Check for empty NDRange - if(gws.dimensions() == 0) + if (gws.dimensions() == 0) { return; } @@ -54,7 +59,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind kernel.cache_gws(gws); cl::NDRange valid_lws; - if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) { valid_lws = cl::NullRange; } @@ -65,12 +70,12 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange lws = cl::NullRange; - if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) + if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { lws = valid_lws; } - if(CLKernelLibrary::get().is_wbsm_supported()) + if (CLKernelLibrary::get().is_wbsm_supported()) { set_wbsm(kernel.kernel(), kernel.wbsm_hint()); } @@ -90,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons // Calculate offset to the start of the window unsigned int offset_first_element = info->offset_first_element_in_bytes(); - for(unsigned int n = 0; n < info->num_dimensions(); ++n) + for (unsigned int n = 0; n < info->num_dimensions(); ++n) { offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n]; } @@ -98,7 +103,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons unsigned int idx_start = idx; _kernel.setArg(idx++, tensor->cl_buffer()); - for(unsigned int d = 0; d < dimension_size; ++d) + for (unsigned int d = 0; d < dimension_size; ++d) { _kernel.setArg(idx++, window.is_broadcasted(d) ? 0 : strides[d]); _kernel.setArg(idx++, window.is_broadcasted(d) ? 
0 : (strides[d] * window[d].step())); @@ -107,7 +112,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons _kernel.setArg(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor() != idx, - "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor()); + "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_tensor()); ARM_COMPUTE_UNUSED(idx_start); } @@ -178,7 +184,7 @@ void ICLKernel::set_target(cl::Device &device) size_t ICLKernel::get_max_workgroup_size() { - if(_max_workgroup_size == 0) + if (_max_workgroup_size == 0) { _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); } @@ -187,7 +193,7 @@ size_t ICLKernel::get_max_workgroup_size() cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items) { - if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) + if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) { return cl::NullRange; } @@ -196,7 +202,7 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); - if(use_dummy_work_items) + if (use_dummy_work_items) { gws.get()[0] = get_next_power_two(gws[0]); gws.get()[1] = get_next_power_two(gws[1]); diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index c82809cef3..6aebef15a5 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTuningParams.h" #include "src/core/CL/DefaultLWSHeuristics.h" @@ -43,14 +43,14 @@ namespace { bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1) { - if(lws0.dimensions() != lws1.dimensions()) + if (lws0.dimensions() != lws1.dimensions()) { return false; } - for(size_t i = 0; i < lws0.dimensions(); ++i) + for (size_t i = 0; i < lws0.dimensions(); ++i) { - if(lws0.get()[i] != lws1.get()[i]) + if (lws0.get()[i] != lws1.get()[i]) { return false; } @@ -71,7 +71,7 @@ private: * * @return The number of arguments enqueued per array object. */ - template + template constexpr static unsigned int num_arguments_per_array() { return num_arguments_per_tensor(); @@ -80,7 +80,7 @@ private: * * @return The number of arguments enqueued per tensor object. */ - template + template constexpr static unsigned int num_arguments_per_tensor() { return 2 + 2 * dimension_size; @@ -116,11 +116,13 @@ protected: * @param[in] window The maximum window which will be returned by window() * @param[in] tuning_params_hint (Optional) Tuning parameters to use. 
*/ - void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0)) + void configure_internal(const Window &window, + CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), + 0)) { _tuning_params_hint = tuning_params_hint; - if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) + if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) { // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which // will be recalculated with use_dummy_work_items flag at run time again anyway. @@ -133,7 +135,13 @@ protected: public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange) + : _kernel(nullptr), + _target(GPUTarget::MIDGARD), + _config_id(arm_compute::default_config_id), + _max_workgroup_size(0), + _type(CLKernelType::UNKNOWN), + _tuning_params_hint(), + _cached_gws(cl::NullRange) { } /** Returns a reference to the OpenCL kernel of this object. @@ -161,7 +169,11 @@ public: * @param[in] window Window the kernel will be executed on. */ template - void add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) + void add_1D_array_argument(unsigned int &idx, + const ICLArray *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window) { add_array_argument(idx, array, strides, num_dimensions, window); } @@ -184,7 +196,7 @@ public: */ void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_1D_tensor_argument(idx, tensor, window); } @@ -208,7 +220,7 @@ public: */ void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_2D_tensor_argument(idx, tensor, window); } @@ -469,7 +481,11 @@ private: * @param[in] window Window the kernel will be executed on. */ template - void add_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window); + void add_array_argument(unsigned int &idx, + const ICLArray *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window); /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. @@ -505,7 +521,11 @@ private: * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. */ -void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false); +void enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), + bool use_dummy_work_items = false); /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. 
* @@ -516,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c * @param[in] window Window the kernel will be executed on. */ template -void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) +void ICLKernel::add_array_argument( + unsigned &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) { ARM_COMPUTE_ERROR_ON(array == nullptr); // Calculate offset to the start of the window unsigned int offset_first_element = 0; - for(unsigned int n = 0; n < num_dimensions; ++n) + for (unsigned int n = 0; n < num_dimensions; ++n) { offset_first_element += window[n].start() * strides[n]; } @@ -531,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, cons unsigned int idx_start = idx; _kernel.setArg(idx++, array->cl_buffer()); - for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + for (unsigned int dimension = 0; dimension < dimension_size; dimension++) { _kernel.setArg(idx++, strides[dimension]); _kernel.setArg(idx++, strides[dimension] * window[dimension].step()); @@ -540,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, cons _kernel.setArg(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array() != idx, - "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array()); + "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_array()); ARM_COMPUTE_UNUSED(idx_start); } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLKERNEL_H */ diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp index 5d8295bdfe..3f7edbb88d 100644 --- a/src/core/CL/ICLSimple2DKernel.cpp +++ b/src/core/CL/ICLSimple2DKernel.cpp @@ -40,6 +40,5 @@ void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, slice); add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h index 5246492401..97bc1e58c2 100644 --- a/src/core/CL/ICLSimple2DKernel.h +++ b/src/core/CL/ICLSimple2DKernel.h @@ -37,5 +37,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */ diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp index fef1a86125..71d7d1f07b 100644 --- a/src/core/CL/ICLSimple3DKernel.cpp +++ b/src/core/CL/ICLSimple3DKernel.cpp @@ -42,6 +42,5 @@ void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h index ff0b274663..5071b6b339 100644 --- a/src/core/CL/ICLSimple3DKernel.h +++ b/src/core/CL/ICLSimple3DKernel.h @@ -39,5 +39,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // 
namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */ diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp index d67fefdf71..c31db8355f 100644 --- a/src/core/CL/ICLSimpleKernel.cpp +++ b/src/core/CL/ICLSimpleKernel.cpp @@ -22,30 +22,35 @@ * SOFTWARE. */ #include "src/core/CL/ICLSimpleKernel.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; -ICLSimpleKernel::ICLSimpleKernel() - : _input(nullptr), _output(nullptr) +ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr) { } -void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size) +void ICLSimpleKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined, + const BorderSize &border_size) { _input = input; _output = output; // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); + Window win = + calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size); diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h index b35547a217..6afd7309aa 100644 --- a/src/core/CL/ICLSimpleKernel.h +++ b/src/core/CL/ICLSimpleKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -55,12 +56,16 @@ public: * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. * @param[in] border_size (Optional) Size of the border. 
*/ - void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + void configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined = false, + const BorderSize &border_size = BorderSize()); protected: const ICLTensor *_input; ICLTensor *_output; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */ diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp index b541bff04a..0771db7f50 100644 --- a/src/core/CL/ICLTensor.cpp +++ b/src/core/CL/ICLTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -ICLTensor::ICLTensor() - : _mapping(nullptr) +ICLTensor::ICLTensor() : _mapping(nullptr) { } diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index b092dfb4e2..35421d025e 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -36,11 +36,7 @@ namespace arm_compute { -CLSymbols::CLSymbols() noexcept(false) - : _loaded( -{ - false, false -}) +CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false}) { } @@ -52,9 +48,9 @@ CLSymbols &CLSymbols::get() bool CLSymbols::load_default() { - static const std::vector libraries_filenames{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" }; + static const std::vector libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"}; - if(_loaded.first) + if (_loaded.first) { return _loaded.second; } @@ -62,34 +58,32 @@ bool CLSymbols::load_default() // Indicate that default loading has been tried _loaded.first = true; - if(load(libraries_filenames, /* use_loader */ false)) + if (load(libraries_filenames, /* use_loader */ false)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from shared library"); return true; } #ifdef __ANDROID__ // When running in NDK environment, the above libraries are not accessible. 
- static const std::vector android_libraries_filenames{ "libOpenCL-pixel.so", "libOpenCL-car.so" }; + static const std::vector android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"}; - if(load(android_libraries_filenames, /* use_loader */ true)) + if (load(android_libraries_filenames, /* use_loader */ true)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from android shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from android shared library"); return true; } #endif // __ANDROID__ // If not returned till here then libraries not found std::stringstream ss; - std::for_each(libraries_filenames.begin(), libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(libraries_filenames.begin(), libraries_filenames.end(), + [&ss](const std::string &s) { ss << s << " "; }); #ifdef __ANDROID__ - std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), + [&ss](const std::string &s) { ss << s << " "; }); #endif // __ANDROID__ std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl; return false; @@ -99,15 +93,15 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u { void *handle = nullptr; unsigned int index = 0; - for(index = 0; index < libraries_filenames.size(); ++index) + for (index = 0; index < libraries_filenames.size(); ++index) { handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL); - if(handle != nullptr) + if (handle != nullptr) { break; } } - if(index == libraries_filenames.size()) + if (index == libraries_filenames.size()) { // Set status of loading to failed _loaded.second = false; @@ -115,22 +109,23 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u } #ifdef __ANDROID__ - typedef void* (*loadOpenCLPointer_t)(const char* name); + typedef void *(*loadOpenCLPointer_t)(const char *name); loadOpenCLPointer_t loadOpenCLPointer; - if (use_loader) { + if (use_loader) + { typedef void (*enableOpenCL_t)(); - enableOpenCL_t enableOpenCL = - reinterpret_cast(dlsym(handle, "enableOpenCL")); + enableOpenCL_t enableOpenCL = reinterpret_cast(dlsym(handle, "enableOpenCL")); enableOpenCL(); - loadOpenCLPointer = reinterpret_cast( - dlsym(handle, "loadOpenCLPointer")); - } else { + loadOpenCLPointer = reinterpret_cast(dlsym(handle, "loadOpenCLPointer")); + } + else + { loadOpenCLPointer = nullptr; } -#define LOAD_FUNCTION_PTR(func_name, _handle) \ - func_name##_ptr = reinterpret_cast( use_loader ? \ - loadOpenCLPointer(#func_name) : dlsym(handle, #func_name)); +#define LOAD_FUNCTION_PTR(func_name, _handle) \ + func_name##_ptr = reinterpret_cast(use_loader ? 
loadOpenCLPointer(#func_name) \ + : dlsym(handle, #func_name)); #else /* __ANDROID__ */ (void)use_loader; // Avoid unused warning #define LOAD_FUNCTION_PTR(func_name, handle) \ @@ -234,12 +229,11 @@ bool opencl_is_available() } } // namespace arm_compute -cl_int clEnqueueMarker(cl_command_queue command_queue, - cl_event *event) +cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, event); } @@ -249,12 +243,11 @@ cl_int clEnqueueMarker(cl_command_queue command_queue, } } -cl_int clWaitForEvents(cl_uint num_events, - const cl_event *event_list) +cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_events, event_list); } @@ -264,12 +257,18 @@ cl_int clWaitForEvents(cl_uint num_events, } } -cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr, - size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMMap(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void *svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event); } @@ -279,12 +278,15 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_ } } -cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, + void *svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -298,7 +300,7 @@ void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, alignment); } @@ -312,7 +314,7 @@ void clSVMFree(cl_context context, void *svm_pointer) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMFree_ptr; - if(func != nullptr) + if (func != nullptr) { func(context, svm_pointer); } @@ -326,7 +328,7 @@ cl_int clGetContextInfo(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, param_name, param_value_size, param_value, param_value_size_ret); } @@ -343,7 +345,7 @@ cl_command_queue clCreateCommandQueue(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr; - if(func != 
nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -360,7 +362,7 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -370,17 +372,16 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c } } -cl_context clCreateContext( - const cl_context_properties *properties, - cl_uint num_devices, - const cl_device_id *devices, - void (*pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, - cl_int *errcode_ret) +cl_context clCreateContext(const cl_context_properties *properties, + cl_uint num_devices, + const cl_device_id *devices, + void (*pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret); } @@ -398,7 +399,7 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, device_type, pfn_notify, user_data, errcode_ret); } @@ -408,17 +409,16 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, } } -cl_int clBuildProgram( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) +cl_int clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, num_devices, device_list, options, pfn_notify, user_data); } @@ -428,22 +428,22 @@ cl_int clBuildProgram( } } -cl_int clEnqueueNDRangeKernel( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, + num_events_in_wait_list, event_wait_list, event); } else { @@ -451,15 +451,11 @@ cl_int clEnqueueNDRangeKernel( } } -cl_int clSetKernelArg( - cl_kernel kernel, - cl_uint 
arg_index, - size_t arg_size, - const void *arg_value) +cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, arg_index, arg_size, arg_value); } @@ -473,7 +469,7 @@ cl_int clRetainMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -487,7 +483,7 @@ cl_int clReleaseMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -497,17 +493,16 @@ cl_int clReleaseMemObject(cl_mem memobj) } } -cl_int clEnqueueUnmapMemObject( - cl_command_queue command_queue, - cl_mem memobj, - void *mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void *mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -521,7 +516,7 @@ cl_int clRetainCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -535,7 +530,7 @@ cl_int clReleaseContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -548,7 +543,7 @@ cl_int clReleaseEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -558,22 +553,22 @@ cl_int clReleaseEvent(cl_event event) } } -cl_int clEnqueueWriteBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -581,22 +576,22 @@ cl_int clEnqueueWriteBuffer( } } -cl_int clEnqueueReadBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void *ptr, - cl_uint num_events_in_wait_list, - const 
cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -604,17 +599,16 @@ cl_int clEnqueueReadBuffer( } } -cl_int clGetProgramBuildInfo( - cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -628,7 +622,7 @@ cl_int clRetainProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -638,27 +632,27 @@ cl_int clRetainProgram(cl_program program) } } -void *clEnqueueMapBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) +void *clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret); + return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, + event_wait_list, event, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -670,7 +664,7 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -680,24 +674,23 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue) } } -cl_program clCreateProgramWithBinary( - cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const size_t *lengths, - const unsigned char **binaries, - cl_int *binary_status, - cl_int *errcode_ret) +cl_program clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id *device_list, + const size_t *lengths, + const unsigned char 
**binaries, + cl_int *binary_status, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -709,7 +702,7 @@ cl_int clRetainContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -723,7 +716,7 @@ cl_int clReleaseProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -737,7 +730,7 @@ cl_int clFlush(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFlush_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -751,7 +744,7 @@ cl_int clFinish(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFinish_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -761,16 +754,15 @@ cl_int clFinish(cl_command_queue command_queue) } } -cl_int clGetProgramInfo( - cl_program program, - cl_program_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, param_name, param_value_size, param_value, param_value_size_ret); } @@ -780,20 +772,17 @@ cl_int clGetProgramInfo( } } -cl_kernel clCreateKernel( - cl_program program, - const char *kernel_name, - cl_int *errcode_ret) +cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, kernel_name, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -805,7 +794,7 @@ cl_int clRetainKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel); } @@ -815,22 +804,17 @@ cl_int clRetainKernel(cl_kernel kernel) } } -cl_mem clCreateBuffer( - cl_context context, - cl_mem_flags flags, - size_t size, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -839,21 +823,17 @@ cl_mem clCreateBuffer( } cl_program 
clCreateProgramWithSource( - cl_context context, - cl_uint count, - const char **strings, - const size_t *lengths, - cl_int *errcode_ret) + cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, count, strings, lengths, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -865,7 +845,7 @@ cl_int clReleaseKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel); } @@ -878,12 +858,12 @@ cl_int clReleaseKernel(cl_kernel kernel) cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, - cl_device_id *devices, + cl_device_id *devices, cl_uint *num_devices) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, device_type, num_entries, devices, num_devices); } @@ -901,7 +881,7 @@ cl_int clGetDeviceInfo(cl_device_id device, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -911,15 +891,12 @@ cl_int clGetDeviceInfo(cl_device_id device, } } -cl_int clGetMemObjectInfo(cl_mem memobj, - cl_mem_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetMemObjectInfo( + cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj, param_name, param_value_size, param_value, param_value_size_ret); } @@ -933,7 +910,7 @@ cl_int clRetainEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -951,7 +928,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, param_name, param_value_size, param_value, param_value_size_ret); } @@ -965,7 +942,7 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_entries, platforms, num_platforms); } @@ -975,17 +952,16 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint } } -cl_int -clGetKernelWorkGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t 
*param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -995,16 +971,15 @@ clGetKernelWorkGroupInfo(cl_kernel kernel, } } -cl_int -clGetCommandQueueInfo(cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1014,16 +989,15 @@ clGetCommandQueueInfo(cl_command_queue command_queue, } } -cl_int -clGetKernelInfo(cl_kernel kernel, - cl_kernel_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelInfo(cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1033,16 +1007,15 @@ clGetKernelInfo(cl_kernel kernel, } } -cl_int -clGetEventProfilingInfo(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1052,23 +1025,22 @@ clGetEventProfilingInfo(cl_event event, } } -cl_mem -clCreateImage(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - const cl_image_desc *image_desc, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateImage_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, image_format, image_desc, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -1076,14 +1048,12 @@ clCreateImage(cl_context context, } } -cl_int clSetKernelExecInfo(cl_kernel kernel, - cl_kernel_exec_info param_name, - size_t param_value_size, - const void *param_value) +cl_int +clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, 
param_value); } @@ -1093,22 +1063,21 @@ cl_int clSetKernelExecInfo(cl_kernel kernel, } } -cl_command_buffer_khr clCreateCommandBufferKHR( - cl_uint num_queues, - const cl_command_queue* queues, - const cl_command_buffer_properties_khr* properties, - cl_int* errcode_ret) +cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues, + const cl_command_queue *queues, + const cl_command_buffer_properties_khr *properties, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_queues, queues, properties, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_INVALID_OPERATION; } @@ -1122,7 +1091,7 @@ cl_int clFinalizeCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1137,7 +1106,7 @@ cl_int clRetainCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1152,7 +1121,7 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1162,18 +1131,17 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer) } } -cl_int clEnqueueCommandBufferKHR( - cl_uint num_queues, - cl_command_queue* queues, - cl_command_buffer_khr command_buffer, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) +cl_int clEnqueueCommandBufferKHR(cl_uint num_queues, + cl_command_queue *queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event); } @@ -1183,27 +1151,26 @@ cl_int clEnqueueCommandBufferKHR( } } - -cl_int clCommandNDRangeKernelKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - const cl_ndrange_kernel_command_properties_khr* properties, - cl_kernel kernel, - cl_uint work_dim, - const size_t* global_work_offset, - const size_t* global_work_size, - const size_t* local_work_size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) +cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + const cl_ndrange_kernel_command_properties_khr *properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr *sync_point_wait_list, + cl_sync_point_khr *sync_point, + cl_mutable_command_khr *mutable_handle) { arm_compute::CLSymbols::get().load_default(); const auto 
func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle); + return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, + local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle); } else { @@ -1211,14 +1178,13 @@ cl_int clCommandNDRangeKernelKHR( } } -cl_int clUpdateMutableCommandsKHR( - cl_command_buffer_khr command_buffer, - const cl_mutable_base_config_khr* mutable_config) +cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr *mutable_config) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer, mutable_config); } @@ -1228,23 +1194,22 @@ cl_int clUpdateMutableCommandsKHR( } } -cl_mem -clImportMemoryARM(cl_context context, - cl_mem_flags flags, - const cl_import_properties_arm *properties, - void *memory, - size_t size, - cl_int *errcode_ret) +cl_mem clImportMemoryARM(cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, properties, memory, size, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h index 3f93c8d6fc..02faae2369 100644 --- a/src/core/CL/cl_kernels/activation_float_helpers.h +++ b/src/core/CL/cl_kernels/activation_float_helpers.h @@ -31,7 +31,8 @@ #endif // GPU_ARCH == GPU_ARCH_BIFROST // Hard-Swish -#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) +#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) // Logistic Activation #define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) @@ -49,13 +50,16 @@ #define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) // Leaky RELU Activation -#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) +#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) // Soft RELU Activation #define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) // ELU Activation -#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) +#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \ + (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) // Absolute 
Activation #define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) @@ -70,7 +74,8 @@ #define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) // GELU Activation -#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) +#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) // Identity Activation #define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h index c420578546..c758ff1278 100644 --- a/src/core/CL/cl_kernels/activation_quant_helpers.h +++ b/src/core/CL/cl_kernels/activation_quant_helpers.h @@ -60,17 +60,17 @@ inline TYPE identiy_op(TYPE x) } #define ACTIVATION_OP2(op, x) op##_op(x) -#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) +#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) #if defined(S1_VAL) && defined(S2_VAL) #if defined(O1_VAL) && defined(O2_VAL) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) @@ -78,17 +78,14 @@ inline TYPE identiy_op(TYPE x) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) #endif /* defined(O1_VAL) && defined(O2_VAL) */ #else /* defined(S1_VAL) && defined(S2_VAL) */ -#define PERFORM_ACTIVATION_QUANT(act, data) \ - ({ \ - data = ACTIVATION_OP(act, data); \ - }) +#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); }) #endif /* defined(S1_VAL) && defined(S2_VAL) */ diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h index 0e938cb668..4bef02314f 100644 --- a/src/core/CL/cl_kernels/gemm_helpers.h +++ b/src/core/CL/cl_kernels/gemm_helpers.h @@ -34,14 +34,14 @@ * */ #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) -#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) +#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) // offset == 0 -#define scalar_access_0_1(x) ((x).s0) -#define scalar_access_0_2(x) ((x).s01) -#define scalar_access_0_3(x) ((x).s012) -#define scalar_access_0_4(x) ((x).s0123) -#define scalar_access_0_8(x) ((x).s01234567) +#define scalar_access_0_1(x) ((x).s0) +#define scalar_access_0_2(x) ((x).s01) +#define scalar_access_0_3(x) ((x).s012) +#define scalar_access_0_4(x) ((x).s0123) +#define scalar_access_0_8(x) ((x).s01234567) #define scalar_access_0_16(x) ((x).s0123456789ABCDEF) // offset == 1 @@ -100,8 +100,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ - ({}) +#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({}) #define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); @@ -186,8 
+185,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) -#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_TENSOR /** Load 2D tensor (consecutive rows and columns) with Z offset. @@ -202,8 +203,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ - ({}) +#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({}) #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); @@ -279,8 +279,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @}*/ // end of group LOAD_TENSOR_M0XN0 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
@@ -395,8 +397,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_BLOCK /** Partially load the 0 to (n-1)th rows of the given variables @@ -517,8 +521,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** Load a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty. @@ -541,22 +547,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0. * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in x but not y. * @@ -578,14 +585,15 @@ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. */ -#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \ + PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in y but not x. * @@ -607,14 +615,15 @@ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_COND_Y) \ + if (!(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** @} */ // end of group LOAD_BLOCK_PARTIAL /** Boundary-aware GeMM block load @@ -676,28 +685,33 @@ */ #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ - LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ + LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 
-#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 +#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 /** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). @@ -795,8 +809,10 @@ * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) * @{ */ -#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) -#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) /** @} */ // end of group LOAD_TEXTURE2D /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded. @@ -815,7 +831,7 @@ #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##0; \ - if(Y_MASK##0 != 0) \ + if (Y_MASK##0 != 0) \ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ else \ BASENAME##0 = 0; @@ -824,7 +840,7 @@ LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##1; \ - if(Y_MASK##1 != 0) \ + if (Y_MASK##1 != 0) \ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ else \ BASENAME##1 = 0; @@ -833,7 +849,7 @@ LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##2; \ - if(Y_MASK##2 != 0) \ + if (Y_MASK##2 != 0) \ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ else \ BASENAME##2 = 0; @@ -842,7 +858,7 @@ LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##3; \ - if(Y_MASK##3 != 0) \ + if (Y_MASK##3 != 0) \ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ else \ BASENAME##3 = 0; @@ -851,7 +867,7 @@ LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##4; \ - if(Y_MASK##4 != 0) \ + if (Y_MASK##4 != 0) \ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ else \ BASENAME##4 = 0; @@ -860,7 +876,7 @@ LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##5; \ - if(Y_MASK##5 != 0) \ + if (Y_MASK##5 != 0) \ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ else \ BASENAME##5 = 0; @@ -869,7 +885,7 @@ LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##6; \ - if(Y_MASK##6 != 0) \ + if (Y_MASK##6 != 0) \ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ else \ BASENAME##6 = 0; @@ -878,7 +894,7 @@ LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, 
STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##7; \ - if(Y_MASK##7 != 0) \ + if (Y_MASK##7 != 0) \ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ else \ BASENAME##7 = 0; @@ -887,7 +903,7 @@ LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##8; \ - if(Y_MASK##8 != 0) \ + if (Y_MASK##8 != 0) \ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ else \ BASENAME##8 = 0; @@ -896,7 +912,7 @@ LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##9; \ - if(Y_MASK##9 != 0) \ + if (Y_MASK##9 != 0) \ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ else \ BASENAME##9 = 0; @@ -905,7 +921,7 @@ LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##A; \ - if(Y_MASK##A != 0) \ + if (Y_MASK##A != 0) \ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ else \ BASENAME##A = 0; @@ -914,7 +930,7 @@ LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##B; \ - if(Y_MASK##B != 0) \ + if (Y_MASK##B != 0) \ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ else \ BASENAME##B = 0; @@ -923,7 +939,7 @@ LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##C; \ - if(Y_MASK##C != 0) \ + if (Y_MASK##C != 0) \ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ else \ BASENAME##C = 0; @@ -932,7 +948,7 @@ LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##D; \ - if(Y_MASK##D != 0) \ + if (Y_MASK##D != 0) \ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ else \ BASENAME##D = 0; @@ -941,7 +957,7 @@ LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##E; \ - if(Y_MASK##E != 0) \ + if (Y_MASK##E != 0) \ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ else \ BASENAME##E = 0; @@ -950,7 +966,7 @@ LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##F; \ - if(Y_MASK##F != 0) \ + if (Y_MASK##F != 0) \ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ else \ BASENAME##F = 0; @@ -976,8 +992,10 @@ * @param[in] Y_MASK The y-axis mask vector. 
If 0, forces BASENAMEn to 0 * @{ */ -#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) -#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) /** @} */ // end of group LOAD_BLOCK_INDIRECT /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). @@ -1088,8 +1106,10 @@ * @param[in] STRIDE_Y The stride in y-axis direction * @{ */ -#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) -#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) /** @} */ // end of group LOAD_SCALAR_AS_VECTOR /** Basic macros to calculate Z offset values from Z0 to Zn-1 @@ -1187,8 +1207,10 @@ * @param[in] STRIDE_Y The stride value in y-axis direction * @{ */ -#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) -#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) /** @} */ // end of group CALCULATE_Z_OFFSET /** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) @@ -1199,8 +1221,7 @@ * @param[in] SCALE The scale factor * @{ */ -#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ - BASENAME##0 *= (DATA_TYPE)SCALE; +#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE; #define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ @@ -1275,7 +1296,7 @@ * @{ */ #define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) -#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) +#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) /** @} */ // end of group SCALE_BLOCK /** Create a new vector containing the values at the given index for a set of given vectors @@ 
-1287,8 +1308,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); +#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); #define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); @@ -1297,13 +1317,20 @@ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); #define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 4) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); -#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 8) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); -#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); +#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); +#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \ + (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \ + (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); /** @} */ // end of group COLUMN_VECTORn /** Create a new vector containing the values at the given index. 
Utility macros for transposing a colum-vector @@ -1315,8 +1342,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0)); +#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0)); #define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); @@ -1329,9 +1355,10 @@ #define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 8) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); -#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); +#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \ + (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); /** @} */ // end of group COLUMN_VECTOR_SCALARn /** Create transposed vectors of the given vectors @@ -1343,8 +1370,7 @@ * @param[in] TYPE The data type of the transposed vectors * @{ */ -#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ - COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); +#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); #define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); @@ -1417,8 +1443,7 @@ * @param[in] BIAS The basename of the added variables * @{ */ -#define ADD_ROW_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS##0; +#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0; #define ADD_ROW_2(BASENAME, BIAS) \ ADD_ROW_1(BASENAME, BIAS) \ @@ -1493,7 +1518,7 @@ * @{ */ #define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) -#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) +#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK /** Broadcast (add single value) to the each element of the destination variables @@ -1503,8 +1528,7 @@ * @param[in] BIAS The variable containing the value to add * @{ */ -#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS; +#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS; #define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ @@ -1578,7 +1602,7 @@ * @{ */ #define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) -#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) +#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK_BROADCAST /** Apply activation to the given variables @@ -1668,8 +1692,10 @@ * @param[in] B_VAL Additional value required by the activation * @{ */ -#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) -#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define 
ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) /** @} */ // end of group ACTIVATION_BLOCK /** Apply convert_ to the given variables @@ -1773,6 +1799,8 @@ * @param[in] BASENAME_DST The basename of the destination variables * @{ */ -#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) -#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) /** @} */ // end of group CONVERT_BLOCK diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index b2ceaf92f3..87a1875f93 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -81,11 +81,11 @@ * @return The reversed vector * @{ */ -#define REV1(x) ((x)) -#define REV2(x) ((x).s10) -#define REV3(x) ((x).s210) -#define REV4(x) ((x).s3210) -#define REV8(x) ((x).s76543210) +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) #define REV16(x) ((x).sFEDCBA9876543210) /** @} */ // end of group REVn @@ -99,7 +99,7 @@ * @{ */ #define REVERSE_STR(x, s) REV##s((x)) -#define REVERSE(x, s) REVERSE_STR(x, s) +#define REVERSE(x, s) REVERSE_STR(x, s) /** @} */ // end of group REVERSE /** Circular-right-shift (rotate-right) the vector of size s by the amount of n. @@ -138,16 +138,16 @@ #define ROT8_7(x) ((x).s12345670) #define ROT8_8(x) ((x)) -#define ROT16_0(x) ((x)) -#define ROT16_1(x) ((x).sF0123456789ABCDE) -#define ROT16_2(x) ((x).sEF0123456789ABCD) -#define ROT16_3(x) ((x).sDEF0123456789ABC) -#define ROT16_4(x) ((x).sCDEF0123456789AB) -#define ROT16_5(x) ((x).sBCDEF0123456789A) -#define ROT16_6(x) ((x).sABCDEF0123456789) -#define ROT16_7(x) ((x).s9ABCDEF012345678) -#define ROT16_8(x) ((x).s89ABCDEF01234567) -#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) #define ROT16_10(x) ((x).s6789ABCDEF012345) #define ROT16_11(x) ((x).s56789ABCDEF01234) #define ROT16_12(x) ((x).s456789ABCDEF0123) @@ -168,7 +168,7 @@ * @{ */ #define ROTATE_STR(x, s, n) ROT##s##_##n(x) -#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) /** @} */ // end of group ROTATE /** Creates a vector of size n filled with offset values corresponding to the location of each element. 
@@ -179,11 +179,11 @@ * @return The vector filled with offset values * @{ */ -#define V_OFFS1(dt) (dt##1)(0) -#define V_OFFS2(dt) (dt##2)(0, 1) -#define V_OFFS3(dt) (dt##3)(0, 1, 2) -#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) -#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS1(dt) (dt##1)(0) +#define V_OFFS2(dt) (dt##2)(0, 1) +#define V_OFFS3(dt) (dt##3)(0, 1, 2) +#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) #define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) /** @} */ // end of group V_OFFSn @@ -197,11 +197,11 @@ * @{ */ #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) -#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) /** @} */ // end of group VEC_OFFS #define VLOAD_STR(size) vload##size -#define VLOAD(size) VLOAD_STR(size) +#define VLOAD(size) VLOAD_STR(size) /** Extended partial vload that correctly handles scalar values as well. * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops @@ -219,23 +219,23 @@ * @{ */ #define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size -#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) +#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) #define NO_LOAD(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vload_partial_1_0 NO_LOAD -#define vload_partial_1_1 vload1 -#define vload_partial_1_2 NO_LOAD -#define vload_partial_1_3 NO_LOAD -#define vload_partial_1_4 NO_LOAD -#define vload_partial_1_5 NO_LOAD -#define vload_partial_1_6 NO_LOAD -#define vload_partial_1_7 NO_LOAD -#define vload_partial_1_8 NO_LOAD -#define vload_partial_1_9 NO_LOAD +#define vload_partial_1_0 NO_LOAD +#define vload_partial_1_1 vload1 +#define vload_partial_1_2 NO_LOAD +#define vload_partial_1_3 NO_LOAD +#define vload_partial_1_4 NO_LOAD +#define vload_partial_1_5 NO_LOAD +#define vload_partial_1_6 NO_LOAD +#define vload_partial_1_7 NO_LOAD +#define vload_partial_1_8 NO_LOAD +#define vload_partial_1_9 NO_LOAD #define vload_partial_1_10 NO_LOAD #define vload_partial_1_11 NO_LOAD #define vload_partial_1_12 NO_LOAD @@ -244,16 +244,16 @@ #define vload_partial_1_15 NO_LOAD #define vload_partial_1_16 NO_LOAD // Size == 2 -#define vload_partial_2_0 NO_LOAD -#define vload_partial_2_1 vload_partial_1 -#define vload_partial_2_2 vload_partial_2 -#define vload_partial_2_3 NO_LOAD -#define vload_partial_2_4 NO_LOAD -#define vload_partial_2_5 NO_LOAD -#define vload_partial_2_6 NO_LOAD -#define vload_partial_2_7 NO_LOAD -#define vload_partial_2_8 NO_LOAD -#define vload_partial_2_9 NO_LOAD +#define vload_partial_2_0 NO_LOAD +#define vload_partial_2_1 vload_partial_1 +#define vload_partial_2_2 vload_partial_2 +#define vload_partial_2_3 NO_LOAD +#define vload_partial_2_4 NO_LOAD +#define vload_partial_2_5 NO_LOAD +#define vload_partial_2_6 NO_LOAD +#define vload_partial_2_7 NO_LOAD +#define vload_partial_2_8 NO_LOAD +#define vload_partial_2_9 NO_LOAD #define vload_partial_2_10 NO_LOAD #define vload_partial_2_11 NO_LOAD #define vload_partial_2_12 NO_LOAD @@ -262,16 +262,16 @@ #define vload_partial_2_15 NO_LOAD #define vload_partial_2_16 NO_LOAD // Size == 3 -#define vload_partial_3_0 NO_LOAD -#define vload_partial_3_1 vload_partial_1 -#define vload_partial_3_2 vload_partial_2 -#define vload_partial_3_3 vload_partial_3 -#define vload_partial_3_4 NO_LOAD -#define vload_partial_3_5 NO_LOAD -#define vload_partial_3_6 NO_LOAD -#define vload_partial_3_7 NO_LOAD 
-#define vload_partial_3_8 NO_LOAD -#define vload_partial_3_9 NO_LOAD +#define vload_partial_3_0 NO_LOAD +#define vload_partial_3_1 vload_partial_1 +#define vload_partial_3_2 vload_partial_2 +#define vload_partial_3_3 vload_partial_3 +#define vload_partial_3_4 NO_LOAD +#define vload_partial_3_5 NO_LOAD +#define vload_partial_3_6 NO_LOAD +#define vload_partial_3_7 NO_LOAD +#define vload_partial_3_8 NO_LOAD +#define vload_partial_3_9 NO_LOAD #define vload_partial_3_10 NO_LOAD #define vload_partial_3_11 NO_LOAD #define vload_partial_3_12 NO_LOAD @@ -280,16 +280,16 @@ #define vload_partial_3_15 NO_LOAD #define vload_partial_3_16 NO_LOAD // Size == 4 -#define vload_partial_4_0 NO_LOAD -#define vload_partial_4_1 vload_partial_1 -#define vload_partial_4_2 vload_partial_2 -#define vload_partial_4_3 vload_partial_3 -#define vload_partial_4_4 vload_partial_4 -#define vload_partial_4_5 NO_LOAD -#define vload_partial_4_6 NO_LOAD -#define vload_partial_4_7 NO_LOAD -#define vload_partial_4_8 NO_LOAD -#define vload_partial_4_9 NO_LOAD +#define vload_partial_4_0 NO_LOAD +#define vload_partial_4_1 vload_partial_1 +#define vload_partial_4_2 vload_partial_2 +#define vload_partial_4_3 vload_partial_3 +#define vload_partial_4_4 vload_partial_4 +#define vload_partial_4_5 NO_LOAD +#define vload_partial_4_6 NO_LOAD +#define vload_partial_4_7 NO_LOAD +#define vload_partial_4_8 NO_LOAD +#define vload_partial_4_9 NO_LOAD #define vload_partial_4_10 NO_LOAD #define vload_partial_4_11 NO_LOAD #define vload_partial_4_12 NO_LOAD @@ -298,16 +298,16 @@ #define vload_partial_4_15 NO_LOAD #define vload_partial_4_16 NO_LOAD // Size == 8 -#define vload_partial_8_0 NO_LOAD -#define vload_partial_8_1 vload_partial_1 -#define vload_partial_8_2 vload_partial_2 -#define vload_partial_8_3 vload_partial_3 -#define vload_partial_8_4 vload_partial_4 -#define vload_partial_8_5 vload_partial_5 -#define vload_partial_8_6 vload_partial_6 -#define vload_partial_8_7 vload_partial_7 -#define vload_partial_8_8 vload_partial_8 -#define vload_partial_8_9 NO_LOAD +#define vload_partial_8_0 NO_LOAD +#define vload_partial_8_1 vload_partial_1 +#define vload_partial_8_2 vload_partial_2 +#define vload_partial_8_3 vload_partial_3 +#define vload_partial_8_4 vload_partial_4 +#define vload_partial_8_5 vload_partial_5 +#define vload_partial_8_6 vload_partial_6 +#define vload_partial_8_7 vload_partial_7 +#define vload_partial_8_8 vload_partial_8 +#define vload_partial_8_9 NO_LOAD #define vload_partial_8_10 NO_LOAD #define vload_partial_8_11 NO_LOAD #define vload_partial_8_12 NO_LOAD @@ -316,16 +316,16 @@ #define vload_partial_8_15 NO_LOAD #define vload_partial_8_16 NO_LOAD // Size == 16 -#define vload_partial_16_0 NO_LOAD -#define vload_partial_16_1 vload_partial_1 -#define vload_partial_16_2 vload_partial_2 -#define vload_partial_16_3 vload_partial_3 -#define vload_partial_16_4 vload_partial_4 -#define vload_partial_16_5 vload_partial_5 -#define vload_partial_16_6 vload_partial_6 -#define vload_partial_16_7 vload_partial_7 -#define vload_partial_16_8 vload_partial_8 -#define vload_partial_16_9 vload_partial_9 +#define vload_partial_16_0 NO_LOAD +#define vload_partial_16_1 vload_partial_1 +#define vload_partial_16_2 vload_partial_2 +#define vload_partial_16_3 vload_partial_3 +#define vload_partial_16_4 vload_partial_4 +#define vload_partial_16_5 vload_partial_5 +#define vload_partial_16_6 vload_partial_6 +#define vload_partial_16_7 vload_partial_7 +#define vload_partial_16_8 vload_partial_8 +#define vload_partial_16_9 vload_partial_9 #define 
vload_partial_16_10 vload_partial_10 #define vload_partial_16_11 vload_partial_11 #define vload_partial_16_12 vload_partial_12 @@ -351,17 +351,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vload_partial_1(DATA, OFFSET, PTR) \ - DATA.s0 = vload1(OFFSET, PTR); +#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR); -#define vload_partial_2(DATA, OFFSET, PTR) \ - DATA.s01 = vload2(OFFSET, PTR); +#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR); -#define vload_partial_3(DATA, OFFSET, PTR) \ - DATA.s012 = vload3(OFFSET, PTR); +#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR); -#define vload_partial_4(DATA, OFFSET, PTR) \ - DATA.s0123 = vload4(OFFSET, PTR); +#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR); #define vload_partial_5(DATA, OFFSET, PTR) \ vload_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -375,8 +371,7 @@ vload_partial_4(DATA.s0123, OFFSET, PTR); \ vload_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vload_partial_8(DATA, OFFSET, PTR) \ - DATA.s01234567 = vload8(OFFSET, PTR); +#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR); #define vload_partial_9(DATA, OFFSET, PTR) \ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -406,13 +401,12 @@ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); -#define vload_partial_16(DATA, OFFSET, PTR) \ - DATA = vload16(OFFSET, PTR); +#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR); /** @} */ // end of groupd vload_partial_n /** @} */ // end of groupd VLOAD_PARTIAL -#define PIXEL_UNIT4 1 -#define PIXEL_UNIT8 2 +#define PIXEL_UNIT4 1 +#define PIXEL_UNIT8 2 #define PIXEL_UNIT16 4 /** Utility macro to convert a vector size in pixel unit. 
@@ -425,27 +419,45 @@ * @{ */ #define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size -#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) +#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) /** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT #define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); -#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_floatx2(img, x_coord, y_coord) \ + (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_floatx4(img, x_coord, y_coord) \ + (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \ + read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); -#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_halfx2(img, x_coord, y_coord) \ + (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_halfx4(img, x_coord, y_coord) \ + (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \ + read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_floatx2(img, x_coord, y_coord, values) \ + (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_floatx4(img, x_coord, y_coord, values) \ + (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && 
defined(cl_khr_fp16) #define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_halfx2(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_halfx4(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) /** Utility macro to read a 2D OpenCL image object. @@ -462,7 +474,7 @@ * @{ */ #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) -#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) +#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) /** @} */ /** Utility macro to write a 2D OpenCL image object. @@ -478,26 +490,28 @@ * * @{ */ -#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) -#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) +#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \ + write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) +#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \ + WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) /** @} */ #define VSTORE_STR(size) vstore##size -#define VSTORE(size) VSTORE_STR(size) +#define VSTORE(size) VSTORE_STR(size) -#define float1 float -#define half1 half -#define char1 char -#define uchar1 uchar -#define short1 short +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short #define ushort1 ushort -#define int1 int -#define uint1 uint -#define long1 long -#define ulong1 ulong +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong #define double1 double -#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vload1(OFFSET, PTR) *(OFFSET + PTR) #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA /** Extended partial vstore that correctly handles scalar values as well. 
@@ -516,23 +530,23 @@ * @{ */ #define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size -#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) +#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) #define NO_STORE(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vstore_partial_1_0 NO_STORE -#define vstore_partial_1_1 vstore1 -#define vstore_partial_1_2 NO_STORE -#define vstore_partial_1_3 NO_STORE -#define vstore_partial_1_4 NO_STORE -#define vstore_partial_1_5 NO_STORE -#define vstore_partial_1_6 NO_STORE -#define vstore_partial_1_7 NO_STORE -#define vstore_partial_1_8 NO_STORE -#define vstore_partial_1_9 NO_STORE +#define vstore_partial_1_0 NO_STORE +#define vstore_partial_1_1 vstore1 +#define vstore_partial_1_2 NO_STORE +#define vstore_partial_1_3 NO_STORE +#define vstore_partial_1_4 NO_STORE +#define vstore_partial_1_5 NO_STORE +#define vstore_partial_1_6 NO_STORE +#define vstore_partial_1_7 NO_STORE +#define vstore_partial_1_8 NO_STORE +#define vstore_partial_1_9 NO_STORE #define vstore_partial_1_10 NO_STORE #define vstore_partial_1_11 NO_STORE #define vstore_partial_1_12 NO_STORE @@ -541,16 +555,16 @@ #define vstore_partial_1_15 NO_STORE #define vstore_partial_1_16 NO_STORE // Size == 2 -#define vstore_partial_2_0 NO_STORE -#define vstore_partial_2_1 vstore_partial_1 -#define vstore_partial_2_2 vstore_partial_2 -#define vstore_partial_2_3 NO_STORE -#define vstore_partial_2_4 NO_STORE -#define vstore_partial_2_5 NO_STORE -#define vstore_partial_2_6 NO_STORE -#define vstore_partial_2_7 NO_STORE -#define vstore_partial_2_8 NO_STORE -#define vstore_partial_2_9 NO_STORE +#define vstore_partial_2_0 NO_STORE +#define vstore_partial_2_1 vstore_partial_1 +#define vstore_partial_2_2 vstore_partial_2 +#define vstore_partial_2_3 NO_STORE +#define vstore_partial_2_4 NO_STORE +#define vstore_partial_2_5 NO_STORE +#define vstore_partial_2_6 NO_STORE +#define vstore_partial_2_7 NO_STORE +#define vstore_partial_2_8 NO_STORE +#define vstore_partial_2_9 NO_STORE #define vstore_partial_2_10 NO_STORE #define vstore_partial_2_11 NO_STORE #define vstore_partial_2_12 NO_STORE @@ -559,16 +573,16 @@ #define vstore_partial_2_15 NO_STORE #define vstore_partial_2_16 NO_STORE // Size == 3 -#define vstore_partial_3_0 NO_STORE -#define vstore_partial_3_1 vstore_partial_1 -#define vstore_partial_3_2 vstore_partial_2 -#define vstore_partial_3_3 vstore_partial_3 -#define vstore_partial_3_4 NO_STORE -#define vstore_partial_3_5 NO_STORE -#define vstore_partial_3_6 NO_STORE -#define vstore_partial_3_7 NO_STORE -#define vstore_partial_3_8 NO_STORE -#define vstore_partial_3_9 NO_STORE +#define vstore_partial_3_0 NO_STORE +#define vstore_partial_3_1 vstore_partial_1 +#define vstore_partial_3_2 vstore_partial_2 +#define vstore_partial_3_3 vstore_partial_3 +#define vstore_partial_3_4 NO_STORE +#define vstore_partial_3_5 NO_STORE +#define vstore_partial_3_6 NO_STORE +#define vstore_partial_3_7 NO_STORE +#define vstore_partial_3_8 NO_STORE +#define vstore_partial_3_9 NO_STORE #define vstore_partial_3_10 NO_STORE #define vstore_partial_3_11 NO_STORE #define vstore_partial_3_12 NO_STORE @@ -577,16 +591,16 @@ #define vstore_partial_3_15 NO_STORE #define vstore_partial_3_16 NO_STORE // Size == 4 -#define vstore_partial_4_0 NO_STORE -#define vstore_partial_4_1 vstore_partial_1 -#define vstore_partial_4_2 vstore_partial_2 -#define vstore_partial_4_3 vstore_partial_3 -#define vstore_partial_4_4 vstore_partial_4 -#define vstore_partial_4_5 NO_STORE -#define 
vstore_partial_4_6 NO_STORE -#define vstore_partial_4_7 NO_STORE -#define vstore_partial_4_8 NO_STORE -#define vstore_partial_4_9 NO_STORE +#define vstore_partial_4_0 NO_STORE +#define vstore_partial_4_1 vstore_partial_1 +#define vstore_partial_4_2 vstore_partial_2 +#define vstore_partial_4_3 vstore_partial_3 +#define vstore_partial_4_4 vstore_partial_4 +#define vstore_partial_4_5 NO_STORE +#define vstore_partial_4_6 NO_STORE +#define vstore_partial_4_7 NO_STORE +#define vstore_partial_4_8 NO_STORE +#define vstore_partial_4_9 NO_STORE #define vstore_partial_4_10 NO_STORE #define vstore_partial_4_11 NO_STORE #define vstore_partial_4_12 NO_STORE @@ -595,16 +609,16 @@ #define vstore_partial_4_15 NO_STORE #define vstore_partial_4_16 NO_STORE // Size == 8 -#define vstore_partial_8_0 NO_STORE -#define vstore_partial_8_1 vstore_partial_1 -#define vstore_partial_8_2 vstore_partial_2 -#define vstore_partial_8_3 vstore_partial_3 -#define vstore_partial_8_4 vstore_partial_4 -#define vstore_partial_8_5 vstore_partial_5 -#define vstore_partial_8_6 vstore_partial_6 -#define vstore_partial_8_7 vstore_partial_7 -#define vstore_partial_8_8 vstore_partial_8 -#define vstore_partial_8_9 NO_STORE +#define vstore_partial_8_0 NO_STORE +#define vstore_partial_8_1 vstore_partial_1 +#define vstore_partial_8_2 vstore_partial_2 +#define vstore_partial_8_3 vstore_partial_3 +#define vstore_partial_8_4 vstore_partial_4 +#define vstore_partial_8_5 vstore_partial_5 +#define vstore_partial_8_6 vstore_partial_6 +#define vstore_partial_8_7 vstore_partial_7 +#define vstore_partial_8_8 vstore_partial_8 +#define vstore_partial_8_9 NO_STORE #define vstore_partial_8_10 NO_STORE #define vstore_partial_8_11 NO_STORE #define vstore_partial_8_12 NO_STORE @@ -613,16 +627,16 @@ #define vstore_partial_8_15 NO_STORE #define vstore_partial_8_16 NO_STORE // Size == 16 -#define vstore_partial_16_0 NO_STORE -#define vstore_partial_16_1 vstore_partial_1 -#define vstore_partial_16_2 vstore_partial_2 -#define vstore_partial_16_3 vstore_partial_3 -#define vstore_partial_16_4 vstore_partial_4 -#define vstore_partial_16_5 vstore_partial_5 -#define vstore_partial_16_6 vstore_partial_6 -#define vstore_partial_16_7 vstore_partial_7 -#define vstore_partial_16_8 vstore_partial_8 -#define vstore_partial_16_9 vstore_partial_9 +#define vstore_partial_16_0 NO_STORE +#define vstore_partial_16_1 vstore_partial_1 +#define vstore_partial_16_2 vstore_partial_2 +#define vstore_partial_16_3 vstore_partial_3 +#define vstore_partial_16_4 vstore_partial_4 +#define vstore_partial_16_5 vstore_partial_5 +#define vstore_partial_16_6 vstore_partial_6 +#define vstore_partial_16_7 vstore_partial_7 +#define vstore_partial_16_8 vstore_partial_8 +#define vstore_partial_16_9 vstore_partial_9 #define vstore_partial_16_10 vstore_partial_10 #define vstore_partial_16_11 vstore_partial_11 #define vstore_partial_16_12 vstore_partial_12 @@ -648,17 +662,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vstore_partial_1(DATA, OFFSET, PTR) \ - vstore1(DATA.s0, OFFSET, PTR); +#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR); -#define vstore_partial_2(DATA, OFFSET, PTR) \ - vstore2(DATA.s01, OFFSET, PTR); +#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR); -#define vstore_partial_3(DATA, OFFSET, PTR) \ - vstore3(DATA.s012, OFFSET, PTR); +#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR); -#define vstore_partial_4(DATA, OFFSET, PTR) \ - vstore4(DATA.s0123, OFFSET, PTR); +#define vstore_partial_4(DATA, 
OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR); #define vstore_partial_5(DATA, OFFSET, PTR) \ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -672,8 +682,7 @@ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ vstore_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vstore_partial_8(DATA, OFFSET, PTR) \ - vstore8(DATA.s01234567, OFFSET, PTR); +#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR); #define vstore_partial_9(DATA, OFFSET, PTR) \ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -703,186 +712,156 @@ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); -#define vstore_partial_16(DATA, OFFSET, PTR) \ - vstore16(DATA, OFFSET, PTR); +#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR); /** @} */ // end of groupd vstore_partial_n /** @} */ // end of groupd VSTORE_PARTIAL // Convert built-in functions with _sat modifier are not supported in floating point so we create defines // without _sat to overcome this issue -#define convert_float_sat convert_float -#define convert_float1_sat convert_float -#define convert_float2_sat convert_float2 -#define convert_float3_sat convert_float3 -#define convert_float4_sat convert_float4 -#define convert_float8_sat convert_float8 +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 #define convert_float16_sat convert_float16 -#define convert_half_sat convert_float -#define convert_half1_sat convert_half -#define convert_half2_sat convert_half2 -#define convert_half3_sat convert_half3 -#define convert_half4_sat convert_half4 -#define convert_half8_sat convert_half8 -#define convert_half16_sat convert_half16 - -#define convert_float1 convert_float -#define convert_half1 convert_half -#define convert_char1 convert_char -#define convert_uchar1 convert_uchar -#define convert_short1 convert_short +#define convert_half_sat convert_float +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short #define convert_ushort1 convert_ushort -#define convert_int1 convert_int -#define convert_uint1 convert_uint -#define convert_long1 convert_long -#define convert_ulong1 convert_ulong +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong #define convert_double1 convert_double -#define convert_char1_sat convert_char_sat -#define convert_uchar1_sat convert_uchar_sat -#define convert_uchar2_sat convert_uchar2_sat -#define convert_uchar3_sat convert_uchar3_sat -#define convert_uchar4_sat convert_uchar4_sat -#define convert_uchar8_sat convert_uchar8_sat +#define convert_char1_sat convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_uchar2_sat convert_uchar2_sat +#define convert_uchar3_sat convert_uchar3_sat +#define convert_uchar4_sat convert_uchar4_sat +#define convert_uchar8_sat convert_uchar8_sat #define convert_uchar16_sat convert_uchar16_sat -#define convert_short1_sat convert_short_sat +#define 
convert_short1_sat convert_short_sat #define convert_ushort1_sat convert_ushort_sat -#define convert_int1_sat convert_int_sat -#define convert_uint1_sat convert_uint_sat -#define convert_long1_sat convert_long_sat -#define convert_ulong1_sat convert_ulong_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat #define convert_double1_sat convert_double_sat #define VEC_DATA_TYPE_STR(type, size) type##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) #define CONVERT_STR(x, type) (convert_##type((x))) -#define CONVERT(x, type) CONVERT_STR(x, type) +#define CONVERT(x, type) CONVERT_STR(x, type) #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) -#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) -#define select_vec_dt_uchar(size) uchar##size -#define select_vec_dt_char(size) char##size +#define select_vec_dt_uchar(size) uchar##size +#define select_vec_dt_char(size) char##size #define select_vec_dt_ushort(size) ushort##size -#define select_vec_dt_short(size) short##size -#define select_vec_dt_half(size) short##size -#define select_vec_dt_uint(size) uint##size -#define select_vec_dt_int(size) int##size -#define select_vec_dt_float(size) int##size -#define select_vec_dt_ulong(size) ulong##size -#define select_vec_dt_long(size) long##size +#define select_vec_dt_short(size) short##size +#define select_vec_dt_half(size) short##size +#define select_vec_dt_uint(size) uint##size +#define select_vec_dt_int(size) int##size +#define select_vec_dt_float(size) int##size +#define select_vec_dt_ulong(size) ulong##size +#define select_vec_dt_long(size) long##size #define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) -#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) -#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) +#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) +#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) -#define signed_int_vec_dt_uchar(size) char##size -#define signed_int_vec_dt_char(size) char##size +#define signed_int_vec_dt_uchar(size) char##size +#define signed_int_vec_dt_char(size) char##size #define signed_int_vec_dt_ushort(size) short##size -#define signed_int_vec_dt_short(size) short##size -#define signed_int_vec_dt_half(size) short##size -#define signed_int_vec_dt_uint(size) int##size -#define signed_int_vec_dt_int(size) int##size -#define signed_int_vec_dt_float(size) int##size -#define signed_int_vec_dt_ulong(size) long##size -#define signed_int_vec_dt_long(size) long##size +#define signed_int_vec_dt_short(size) short##size +#define signed_int_vec_dt_half(size) short##size +#define signed_int_vec_dt_uint(size) int##size +#define signed_int_vec_dt_int(size) int##size +#define signed_int_vec_dt_float(size) int##size +#define signed_int_vec_dt_ulong(size) long##size +#define signed_int_vec_dt_long(size) long##size #define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) -#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, 
size) -#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) - -#define sum_reduce_1(x) (x) -#define sum_reduce_2(x) ((x).s0) + ((x).s1) -#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) -#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) -#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) +#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) +#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) + +#define sum_reduce_1(x) (x) +#define sum_reduce_2(x) ((x).s0) + ((x).s1) +#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) +#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) +#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) #define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) #define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) -#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) +#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) -#define prod_reduce_1(x) (x) -#define prod_reduce_2(x) ((x).s0) * ((x).s1) -#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) -#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) -#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) +#define prod_reduce_1(x) (x) +#define prod_reduce_2(x) ((x).s0) * ((x).s1) +#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) +#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) +#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) #define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) #define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) -#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) +#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) -#define max_reduce_1(x) (x) -#define max_reduce_2(x) max(((x).s0), ((x).s1)) -#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) -#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) -#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) +#define max_reduce_1(x) (x) +#define max_reduce_2(x) max(((x).s0), ((x).s1)) +#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) +#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) +#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) #define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) #define MAX_REDUCE_STR(x, size) max_reduce_##size(x) -#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) +#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) -#define min_reduce_1(x) (x) -#define min_reduce_2(x) min(((x).s0), ((x).s1)) -#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2)) -#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23)) -#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567)) +#define min_reduce_1(x) (x) +#define min_reduce_2(x) min(((x).s0), ((x).s1)) +#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2)) +#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23)) +#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567)) #define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF)) #define MIN_REDUCE_STR(x, size) min_reduce_##size(x) -#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size) - -#define VECTOR_DECLARATION(name) \ - __global uchar *name##_ptr, \ 
- uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_offset_first_element_in_bytes - -#define IMAGE_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR3D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR4D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint name##_step_w, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR5D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint name##_step_w, \ - uint name##_stride_v, \ - uint name##_step_v, \ - uint name##_offset_first_element_in_bytes +#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR5D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \ + uint name##_step_v, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) @@ -890,38 +869,47 @@ #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) -#define CONVERT_TO_IMAGE_STRUCT(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y) #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, 
name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, name##_step_z) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0) -#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, \ + name##_step_w, mod_size) -#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ - tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ + tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, 
name##_stride_z, name##_step_z) /** Structure to hold Vector information */ typedef struct Vector @@ -970,10 +958,10 @@ typedef struct Tensor4D * * @return An image object */ -inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) +inline Vector +update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) { - Vector vector = - { + Vector vector = { .ptr = ptr, .offset_first_element_in_bytes = offset_first_element_in_bytes, .stride_x = stride_x, @@ -993,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ * * @return An image object */ -inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +inline Image update_image_workitem_ptr( + __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) { - Image img = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } @@ -1019,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el * * @return A 3D tensor object */ -inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Image img = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; - img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return img; } @@ -1045,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o * * @return A 3D tensor object */ -inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = 
stride_z}; + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return tensor; } @@ -1072,34 +1068,44 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi * * @return A 3D tensor object */ -inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; return tensor; } -inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, - uint step_w, - uint mod_size) +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z, + uint stride_w, + uint step_w, + uint mod_size) { - Tensor4D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w - }; - - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; + Tensor4D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z, + .stride_w = stride_w}; + + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; return tensor; } @@ -1171,7 +1177,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid const uint x = index; - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + + tensor->offset_first_element_in_bytes; } #endif // _HELPER_H diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 562c5d3236..166260a3c0 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -34,7 +34,7 @@ * @return The converted vector */ #define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) /** Quantize a floating-point scalar value to 8-bit asymmetric * @@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float 
offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Correctly-rounded-to-nearest division by a power-of-two. */ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ - { \ - const VEC_DATA_TYPE(int, size) \ - zero = (VEC_DATA_TYPE(int, size))0; \ - const VEC_DATA_TYPE(int, size) \ - one = (VEC_DATA_TYPE(int, size))1; \ - VEC_DATA_TYPE(int, size) \ - mask = (one << exponent) - one; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ - return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ + return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -167,27 +168,29 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ } /** Each bit of the result is set to the corresponding bit of either then_val or @@ -198,10 +201,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not. 
*/ -#define ASYMM_SELECT_USING_MASK_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ - { \ - return (if_mask & then_val) ^ (~if_mask & else_val); \ +#define ASYMM_SELECT_USING_MASK_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \ + VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ + { \ + return (if_mask & then_val) ^ (~if_mask & else_val); \ } /** For each element of input vector, the corresponding bits of the result item are set @@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ } -#define EXP_BARREL_SHIFTER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ - { \ - if(k_integer_bits > exponent) \ - { \ - const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ - return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ - } \ - \ - return result; \ +#define EXP_BARREL_SHIFTER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + { \ + if (k_integer_bits > exponent) \ + { \ + const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ + } \ + \ + return result; \ } /** Calculates \f$ exp(x) \f$ for x < 0. @@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ - { \ - const int k_fractional_bits = 31 - k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - k_one_quarter = 1 << (k_fractional_bits - 2); \ - VEC_DATA_TYPE(int, size) \ - mask = k_one_quarter - 1; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ - VEC_DATA_TYPE(int, size) \ - remainder = a_mod_quarter_minus_one_quarter - a; \ - \ - result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ - \ - if(k_integer_bits > 5) \ - { \ - const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ - result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ +#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + { \ + const int k_fractional_bits = 31 - k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + k_one_quarter = 1 << (k_fractional_bits - 2); \ + VEC_DATA_TYPE(int, size) \ + mask = k_one_quarter - 1; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \ + size); \ + VEC_DATA_TYPE(int, size) \ + remainder = a_mod_quarter_minus_one_quarter - a; \ + \ + result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + \ + if (k_integer_bits > 5) \ + { \ + 
const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ + result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ } /** Calculates the product of a integer value by a power of two, with either a positive exponent @@ -297,26 +303,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Arithmetic left or right shift. */ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if(exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ } /** Calculates (a+b)/2, rounded to the nearest integer. @@ -326,20 +333,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return (a+b)/2, rounded to the nearest integer. 
*/ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). @@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ VEC_DATA_TYPE(int, size) \ - half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ + half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ VEC_DATA_TYPE(int, size) \ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ - for(int i = 0; i < 3; i++) \ + for (int i = 0; i < 3; i++) \ { \ VEC_DATA_TYPE(int, size) \ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ @@ -378,48 +386,57 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Rescaled value. 
*/ -#define ASYMM_RESCALE_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ - { \ - int exponent = src_integer_bits - dst_integer_bits; \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ +#define ASYMM_RESCALE_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ + { \ + int exponent = src_integer_bits - dst_integer_bits; \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ } -#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) -#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) #define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) -#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) #define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) -#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) -#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) -#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) +#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) -#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) -#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) +#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ + asymm_select_using_mask##size(if_mask, then_val, else_val) +#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) #define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) -#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) +#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \ + exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) #define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) 
asymm_exp_on_negative_values##size(a, k_integer_bits) -#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ + asymm_saturating_rounding_mult_by_pow2##size(x, exponent) #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) -#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) - -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ +#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) + +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 
0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ } -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) QUANTIZE_IMPL(uchar, 1) QUANTIZE_IMPL(char, 1) diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h index 4ba2b2ca3a..4daf0adc89 100644 --- a/src/core/CL/cl_kernels/load_store_utility.h +++ b/src/core/CL/cl_kernels/load_store_utility.h @@ -223,8 +223,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group STORE_BLOCK /** Convert and store a block of the given size M0xN0 @@ -245,8 +247,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group CONVERT_STORE_BLOCK /** Partially store the 0 to (n-1)th rows of the given variables @@ -365,8 +369,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** Store a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. @@ -388,22 +394,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 
*/ -#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ +#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } /** Store a block that can only be partial in x but not y. * @@ -425,7 +432,7 @@ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. */ #define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ + if (!(PARTIAL_COND_X)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -453,7 +460,7 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
*/ #define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ + if (!(PARTIAL_COND_Y)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -517,23 +524,28 @@ #if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ + STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 @@ -560,8 +572,7 @@ #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) #else // defined(PARTIAL_STORE_M0) -#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ - ((uint)(y * M0)) +#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0)) #endif // defined(PARTIAL_STORE_M0) /** @} */ // end of group COMPUTE_M0_START_ROW diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h index bed94a7b3b..cb2f4b0319 100644 --- a/src/core/CL/cl_kernels/repeat.h +++ b/src/core/CL/cl_kernels/repeat.h @@ -75,7 +75,9 @@ P_X##_DEF(F, P_A, P_B, P_C); \ REPEAT_3_15(P_X, P_A, P_B, P_C) -#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \ + REPEAT_3_##P_NUM(P_OP, P_A, P_B, \ + P_C) //One level of indirection to 
ensure order of expansion does not affect preprocessing P_NUM #define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) // Repeat macros with 4 param, excluding the implicit ID param @@ -126,52 +128,59 @@ P_X##_DEF(F, P_A, P_B, P_C, P_D); \ REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) -#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \ + REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \ + P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM #define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) // Macro for initializing N variables. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) -#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL +#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) // Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) -#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) +#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) #define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) // Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) #define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) -#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) +#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \ + REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) // Macro for adding a constant to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL +#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL #define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) // Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...) -#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL +#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL #define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) // Macro for adding a vector to N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) #define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC -#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) +#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) // Macro for adding a two N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) 
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID -#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) +#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) // Macro for performing Max between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) +#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) #define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) // Macro for performing Min between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) +#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) #define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) // Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) // Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...) -#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) // Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables. 
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ @@ -182,6 +191,7 @@ VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ }) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) #endif // ARM_COMPUTE_REPEAT_H diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h index 642483ab3c..6595bd1981 100644 --- a/src/core/CL/cl_kernels/warp_helpers.h +++ b/src/core/CL/cl_kernels/warp_helpers.h @@ -31,11 +31,13 @@ * @param[in] border_size Border size of the image * */ -inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +inline const float8 +clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) { const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); - return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, + clamped_y.s3); } /** Clamps the given coordinates to the borders. @@ -74,7 +76,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int */ inline const float8 get_neighbour_coords(const float2 coord) { - return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, + /*br*/ coord.s0 + 1, coord.s1 + 1); } /** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values @@ -85,37 +88,38 @@ inline const float8 get_neighbour_coords(const float2 coord) * @param[in] height Height of the image * @param[in] border_size Border size */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border( + const Image *in, const float8 coords, const float width, const float height, const float border_size) { // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 
// Sets the 4x4 coordinates for each of the four input texels const float8 fc = floor(coords); - const float16 c1 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); - const float16 c2 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + const float16 c1 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); // Loads the values from the input image const float16 t = (float16)( - /* tl, tr, bl, br */ - * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), - *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), - *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), - *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), - *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), - *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), - *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), - *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); - const float8 a = coords - fc; - const float8 b = ((float8)(1.f)) - a; - const float4 fr = (float4)( - ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), - ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), - ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), - ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + /* tl, tr, bl, br */ + *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), + *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), + *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), + *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), + *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), + *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), + *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), + *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = + (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * 
b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4)); } @@ -126,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const * @param[in] width Width of the image * @param[in] height Height of the image */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) + bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) { return bilinear_interpolate_with_border(in, coords, width, height, 1); } diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp index 2728958add..5b72354abe 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,16 +45,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, + DataType::U64); } return Status{}; @@ -66,22 +71,34 @@ CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, 
ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape{ input->info()->tensor_shape() }; + TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(DataType::S32) + .reset_padding() + .set_is_resizable(true)); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -90,11 +107,14 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Set build options const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0)); - const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) ? 2U : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. + const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) + ? 2U + : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size)); build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); @@ -104,7 +124,7 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Create kernel std::string kernel_axis_name; - switch(axis) + switch (axis) { case 0: build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -135,7 +155,10 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -146,7 +169,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { @@ -154,7 +177,8 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) Window out_window(window); Window in_window(window); out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - in_window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), 
_input->info()->dimension(0))); + in_window.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u)); // Get first input and output slices @@ -166,15 +190,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } break; case 1: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window window_in{window}; + window_in.set(Window::DimY, + Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); Window in_slice = window_in.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); @@ -184,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } break; case 2: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window window_in{window}; + window_in.set(Window::DimZ, + Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); Window in_slice = window_in.first_slice_window_3D(); Window out_slice = window.first_slice_window_3D(); @@ -202,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); } break; case 3: { // Get first input and output slices - Window window_in{ window }; + Window window_in{window}; window_in.set(3, Window::Dimension(0, 1, 1)); Window in_slice = window_in.first_slice_window_4D(); Window out_slice = window.first_slice_window_4D(); @@ -220,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); } break; default: diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h index 5f36bdf113..fb3b41b0de 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H #include "arm_compute/core/Types.h" + 
#include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -72,7 +73,11 @@ public: * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel. * @@ -84,7 +89,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp index 3fa8a8edaa..c88a852a44 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp @@ -23,58 +23,64 @@ */ #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); - if(beta != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, 
gamma); } - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -86,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->element_size(), input->dimension(0)); // Configure kernel window Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); bool window_changed = false; - if(output != nullptr) + if (output != nullptr) { AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); window_changed = update_window_and_padding(win, input_access, output_access); @@ -104,30 +111,50 @@ std::pair validate_and_configure_window_nchw(ITensorInfo *input, window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ?
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel() - : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false) + : _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _beta(nullptr), + _gamma(nullptr), + _epsilon(0), + _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } -void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); - auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma }); + auto padding_info = get_padding_info({input, output, mean, var, beta, gamma}); _input = input; _output = output; _mean = mean; @@ -142,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? 
gamma->info() : nullptr, epsilon, act_info)); - unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); @@ -157,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA"); // Create kernel - _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = + create_kernel(compile_context, + "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Set kernel static arguments unsigned int include_output = (!_run_in_place) ? 
1 : 0; - unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters - if(_beta != nullptr) + unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters + if (_beta != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip beta parameter } - if(_gamma != nullptr) + if (_gamma != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip gamma parameter } _kernel.setArg(idx++, _epsilon); - if(output != nullptr) + if (output != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } // Configure kernel window - if(input->info()->data_layout() == DataLayout::NHWC) + if (input->info()->data_layout() == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -205,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); } -Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); - if(input->data_layout() != DataLayout::NHWC) + if (input->data_layout() != DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get()) + .first); } return Status{}; @@ -236,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor(); add_1D_tensor_argument(idx, _mean, vector_slice); add_1D_tensor_argument(idx, _var, vector_slice); - if(_beta != nullptr) + if (_beta != nullptr) { add_1D_tensor_argument(idx, _beta, vector_slice); } - if(_gamma != nullptr) + if (_gamma != nullptr) { add_1D_tensor_argument(idx, _gamma, vector_slice); } @@ -249,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue { idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h index acbe0f2a26..1a88d2a8c5 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Set the input and output tensors. * @@ -82,8 +89,15 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, - const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. 
@@ -99,10 +113,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp index 143a842d02..c640b5a8d6 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp @@ -25,13 +25,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include "arm_compute/core/TensorInfo.h" using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -66,10 +71,11 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); - const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -79,8 +85,7 @@ Status 
validate_arguments_static(const ITensorInfo *input, const int block_shape } } // namespace -CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr) +CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, block_shape, output }); + auto padding_info = get_padding_info({input, block_shape, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info())); @@ -106,8 +114,9 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); - _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); - + _kernel = create_kernel(compile_context, + "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -116,47 +125,65 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, - const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); + 
ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x)); build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y)); build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left)); build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top)); - _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -185,7 +212,7 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu unsigned int idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_argument(idx, batch_id); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { add_1D_tensor_argument(idx, _block_shape, vector_slice); } @@ -193,7 +220,6 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h index a05184cd5b..b9d3e66fe2 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -65,7 +66,10 @@ public: * * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ - 
void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -74,7 +78,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] compile_context The compile context to be used. @@ -84,7 +92,12 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. 
@@ -106,7 +119,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info); + static Status validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp index 11e6d021a5..de3fb43de8 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseKernel.cpp @@ -28,25 +28,29 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute { -CLBitwiseKernel::CLBitwiseKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op) +void CLBitwiseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); - if(op != BitwiseOperation::NOT) + if (op != BitwiseOperation::NOT) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); @@ -56,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Output auto inizialitation if not yet initialized auto_init_if_empty(*(output->info()), *(input1->info())); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); // Configure kernel window const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0)); @@ -68,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Create kernel std::string kernel_name = ""; - switch(op) + switch (op) { case BitwiseOperation::AND: kernel_name = "bitwise_and"; @@ -107,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_2D_tensor_argument(idx, _input1, slice); - if(_input2 != nullptr) + if (_input2 != nullptr) { add_2D_tensor_argument(idx, _input2, slice); } add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h index c5a999643d..2c74955ae4 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.h +++ b/src/core/CL/kernels/CLBitwiseKernel.h @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. 
* @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp index 72de854afb..f32c518e29 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes); @@ -53,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16; - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f); @@ -65,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f); @@ -83,22 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe } } // namespace -CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() - : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) +CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor 
*deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); - auto padding_info = get_padding_info({ boxes, pred_boxes, deltas }); - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto padding_info = get_padding_info({boxes, pred_boxes, deltas}); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); @@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale())); build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1"); - if(is_quantized) + if (is_quantized) { build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type())); const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform(); @@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor const unsigned int num_elems_processed_per_iteration = 4; - Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h index 08f350e86a..9a1bb49bb9 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h @@ -58,7 +58,10 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. * */ - void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -71,7 +74,11 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
* */ - void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -85,7 +92,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index a2a0bc4fb4..ec58bf9e7a 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); // There cannot be more groups than channels ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -70,11 +75,12 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen auto_init_if_empty(*output, *input->clone()); const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - if(is_nhwc) + if (is_nhwc) { - unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); - Window win_collapsed = win.collapse(win, Window::DimZ); + unsigned int num_elems_processed_per_iteration_x = + adjust_vec_size(max_cl_vector_width 
/ input->element_size(), input->dimension(0)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); + Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } else @@ -83,22 +89,25 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen constexpr unsigned int num_elems_processed_per_iteration_y = 2; // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + Window win = calculate_max_window( + *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); const bool window_changed = update_window_and_padding(win, input_access, output_access); Window win_collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win_collapsed); } } } // namespace -CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr) +CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -108,23 +117,27 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - const bool is_nhwc = data_layout == DataLayout::NHWC; - const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - unsigned int vec_size_x = 0; - unsigned int vec_size_x_leftovers = 0; - if(is_nhwc) + const DataLayout data_layout = input->info()->data_layout(); + const bool is_nhwc = data_layout == DataLayout::NHWC; + const unsigned int channels = + input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + unsigned int vec_size_x = 0; + unsigned int vec_size_x_leftovers = 0; + if (is_nhwc) { - vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), 
input->info()->dimension(0)); vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; } else @@ -170,13 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(2)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h index 31c007f17e..43c939ebd8 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h @@ -60,7 +60,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups); /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel * * @param[in] input Input tensor info. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp index f4d6316517..f27270733e 100644 --- a/src/core/CL/kernels/CLComparisonKernel.cpp +++ b/src/core/CL/kernels/CLComparisonKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,14 +39,10 @@ namespace arm_compute namespace { // Create supported comparisons map -const std::map<ComparisonOperation, std::string> supported_comparison_ops = -{ - { ComparisonOperation::Equal, "EQUAL" }, - { ComparisonOperation::NotEqual, "NOTEQUAL" }, - { ComparisonOperation::Greater, "GREATER" }, - { ComparisonOperation::GreaterEqual, "GREATEREQUAL" }, - { ComparisonOperation::Less, "LESS" }, - { ComparisonOperation::LessEqual, "LESSEQUAL" }, +const std::map<ComparisonOperation, std::string> supported_comparison_ops = { + {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"}, + {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"}, + {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"}, }; int calculate_num_elems_processed_per_iteration(const ITensorInfo &input) { return 16 / input.element_size(); } -Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation) +Status validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN); @@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured output - if(output.total_size() > 0) + if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), @@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { - const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1); // Auto initialize output if not initialized @@ -90,27 +90,34 @@ std::pair validate_and_configure_window(ITensorInfo &input1, ITe AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration); AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -CLComparisonKernel::CLComparisonKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation)); @@ -129,10 +136,11 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons // Set kernel build options std::set<std::string> build_opts; build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); build_opts.emplace("-DOP=" + operation_name); build_opts.emplace("-DOP_NAME=" + lower_string(operation_name)); - if(is_data_type_quantized(input1->info()->data_type())) + if (is_data_type_quantized(input1->info()->data_type())) { const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform(); @@ -160,12 +168,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons _config_id += lower_string(string_from_data_layout(input1->info()->data_layout())); } -Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparisonKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); return Status{}; } @@ -181,17 +193,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 &&
!is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -212,16 +225,16 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } BorderSize CLComparisonKernel::border_size() const { const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info()); - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize{ 0, border, 0, 0 }; + const unsigned int replicateSize = + _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize{0, border, 0, 0}; } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h index 0b94190183..174a6c9bf9 100644 --- a/src/core/CL/kernels/CLComparisonKernel.h +++ b/src/core/CL/kernels/CLComparisonKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLCOMPARISONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. * @param[in] operation Comparison operation to use. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel * * @param[in] input1 Source tensor. Data types supported: All. 
@@ -74,10 +79,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp index 76af5d564a..f8ecc4c098 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,8 @@ CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, const PadStrideInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -60,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); - for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); } @@ -68,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co return Status{}; } -void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -119,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; const int out_step_y = _info.stride().second; - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -137,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); 
enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); break; } case DataLayout::NHWC: @@ -156,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); break; } default: diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h index e0d1322341..762989a836 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h @@ -62,7 +62,10 @@ public: * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * * @param[in] input Source tensor info. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp index 0fc0ff8168..b33e0a8b6f 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -38,7 +39,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info); @@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first); ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32); - if(!is_qasymm) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32); + if (!is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info); } - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * + weights_info->dimension(idx_b)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b)); - if(bias != nullptr) + if (bias != nullptr) { - if(is_qasymm) + if (is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), + stride_info); - const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, 
*weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) +std::pair validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -97,11 +111,17 @@ std::pair validate_and_configure_window(const ITensorInfo *input const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = + deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); - const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info())); + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_layout(data_layout) + .set_quantization_info(input->quantization_info())); Window win = calculate_max_window(*input); @@ -109,29 +129,37 @@ std::pair validate_and_configure_window(const ITensorInfo *input } } // namespace -CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() - : _add_bias(false), - _bias(nullptr) +CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info); } -void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, - const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info) +void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? 
bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), + output->info(), input_info, weights_info, deconv_info)); - auto padding_info = get_padding_info({ input, bias, output }); + auto padding_info = get_padding_info({input, bias, output}); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); + auto win_config = + validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); const DataLayout data_layout = input_info->data_layout(); @@ -178,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info)); @@ -194,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu unsigned int idx = 0; add_3D_tensor_argument(idx, _input, collapsed); add_3D_tensor_argument(idx, _output, collapsed); - if(_add_bias) + if (_add_bias) { add_1D_tensor_argument(idx, _bias, collapsed); } diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h index ce354fa86f..8f436b07e3 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h @@ -67,7 +67,12 @@ public: * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + void configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Initialise the kernel's source and destination. * * @param[in] compile_context The compile context to be used. @@ -79,8 +84,13 @@ public: * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel. * @@ -93,7 +103,12 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp index 5c1dc4fbf6..cdf19ab2e1 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -63,8 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -74,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void 
CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); @@ -98,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); @@ -137,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_in, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_in)); + } while (window.slide_window_slice_3D(slice_in)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h index 1f7f77b569..cef70c4dda 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index e34b6929e7..b95abe795f 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/ICLKernel.h" @@ -45,12 +46,18 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); bool in_place = false; - if(output == nullptr || output == input) + if (output == nullptr || output == input) { in_place = true; output = input; @@ -58,11 +65,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && + (export_to_cl_image(weights) == false), + "Weights cannot be exported to cl_image!"); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1); @@ -72,33 +82,40 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier)); // In place restrictions - if(in_place) + if 
(in_place) { - const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U); + const int weights_width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const int weights_height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || + weights->tensor_shape()[weights_height_idx] != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it + ARM_COMPUTE_RETURN_ERROR_ON( + conv_info.pad_stride_info + .has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it } - const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation }; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); + const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), + conv_info.dilation}; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); - if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) + if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) { ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0); } const bool is_quantized = is_data_type_quantized(input->data_type()); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -108,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); @@ -116,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0)); @@ -134,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - if(output->total_size() != 0) + if (output->total_size() != 0) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - if(is_data_type_quantized(input->data_type())) + if (is_data_type_quantized(input->data_type())) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + const UniformQuantizationInfo oq_info = + (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; @@ -171,30 +190,48 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel _type = CLKernelType::DEPTHWISE; } -void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, + output_multipliers, output_shifts); } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - if(output == nullptr) + if (output == nullptr) { // In-place output = input; } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - dwc_info, conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info, + conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? 
output_shifts->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info); - auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( + *(input->info()), *(weights->info()), conv_info); + auto_init_if_empty(*(output->info()), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_quantization_info(output->info()->quantization_info())); _input = input; _output = output; @@ -214,12 +251,12 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & CLBuildOptions build_opts; // Update the padding for the input/weights tensor if we can export to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info()); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info()); } @@ -229,9 +266,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & const auto act_function = conv_info.act_info.activation(); const auto dst_data_type = _output->info()->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -268,23 +306,24 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1)); - build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0)); + build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", + "-DN0_A=" + support::cpp11::to_string(n0)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0)); build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION"); // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll - set_unroll_with_pragma(build_opts, { static_cast(_weights->info()->dimension(1) + m0 - 1), - static_cast(_weights->info()->dimension(1)), - static_cast(_weights->info()->dimension(2)) - }); + set_unroll_with_pragma(build_opts, {static_cast(_weights->info()->dimension(1) + m0 - 1), + 
static_cast(_weights->info()->dimension(1)), + static_cast(_weights->info()->dimension(2))}); - if(biases != nullptr) + if (biases != nullptr) { build_opts.add_option(std::string("-DHAS_BIAS")); - build_opts.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); + build_opts.add_option( + std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); } - if(_is_quantized) + if (_is_quantized) { kernel_name = "dwc_native_quantized_nhwc"; const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform(); @@ -306,13 +345,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + get_cl_type_from_data_type(_output_multipliers->info()->data_type())); - build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + get_cl_type_from_data_type(_output_shifts->info()->data_type())); - build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); + build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + + get_cl_type_from_data_type(_output_multipliers->info()->data_type())); + build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + + get_cl_type_from_data_type(_output_shifts->info()->data_type())); + build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, + "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach int a_val{}; int b_val{}; - std::tie(b_val, a_val) = get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); + std::tie(b_val, a_val) = + get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val)); build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val)); @@ -321,8 +364,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & { kernel_name = "dwc_native_fp_nhwc"; build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); } Window win = calculate_max_window(*(output->info()), Steps(n0, m0)); @@ -350,10 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & _config_id += string_from_data_type(input->info()->data_type()); } -Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const 
DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); return Status{}; } @@ -370,47 +422,52 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm cl::Image2D input_cl_image; cl::Image2D weights_cl_image; - if(_export_input_to_cl_image || _export_weights_to_cl_image) + if (_export_input_to_cl_image || _export_weights_to_cl_image) { // Export cl_buffer to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { - const size_t image_w = _input->info()->dimension(0) / 4; - const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); + const size_t image_w = _input->info()->dimension(0) / 4; + const size_t image_h = + _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _input->info()->strides_in_bytes()[1]; - input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + input_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, + _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { - const size_t image_w = _weights->info()->dimension(0) / 4; - const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); + const size_t image_w = _weights->info()->dimension(0) / 4; + const size_t image_h = + _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1]; - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch, - CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, + _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, _input); add_4d_tensor_nhwc_argument(idx, _output); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, _weights); - if(_is_quantized) + if (_is_quantized) { add_1D_tensor_argument(idx, _output_multipliers, slice); add_1D_tensor_argument(idx, _output_shifts, slice); } - if(_biases != nullptr) + if (_biases != nullptr) { add_1D_tensor_argument(idx, _biases, 
slice); } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 8eee7b2500..d34a662966 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/function_info/ConvolutionInfo.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { class ICLTensor; @@ -74,15 +74,28 @@ public: * * no padding * * no change of data layout after configure */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * @@ -90,23 +103,29 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input {}; + const ICLTensor *_input{}; const ICLTensor *_weights{}; const ICLTensor *_biases{}; ICLTensor *_output{}; - unsigned int _depth_multiplier{ 0 }; + unsigned int _depth_multiplier{0}; const ICLTensor *_output_multipliers{}; const ICLTensor *_output_shifts{}; - bool _export_input_to_cl_image{ false }; - bool _export_weights_to_cl_image{ true }; - bool _is_quantized{ false }; + bool _export_input_to_cl_image{false}; + bool _export_weights_to_cl_image{true}; 
+ bool _is_quantized{false}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp index 9b514ed705..3d8f875ef7 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,17 +38,20 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -57,7 +61,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -69,21 +76,27 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() - : _input(nullptr), _output(nullptr), _idx(nullptr) +CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config); } -void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx); - auto padding_info = get_padding_info({ input, output, idx }); + auto padding_info = 
get_padding_info({input, output, idx}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); _input = input; @@ -114,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -137,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _output, slice); add_1D_tensor_argument(idx, _idx, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h index e5583a4c22..fdd1bcc3d3 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H #define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -56,7 +56,8 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void + configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -65,7 +66,11 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
@@ -75,7 +80,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp index 95f4b640bd..3729e6b77d 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -59,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -76,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTRadixStageKernel::CLFFTRadixStageKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -87,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config) +void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); - auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ?
output->info() : nullptr, config)); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -110,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments if not the first stage - if(!config.is_first_stage) + if (!config.is_first_stage) { const unsigned int Ni = config.Nx * config.radix; const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni); - unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg(idx++, config.Nx); _kernel.setArg(idx++, Ni); _kernel.setArg(idx, exp_const); @@ -136,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config) + .first); return Status{}; } std::set<unsigned int> CLFFTRadixStageKernel::supported_radix() { - return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 }; + return std::set<unsigned int>{2, 3, 4, 5, 7, 8}; } void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) @@ -165,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h index 9bb310db83..de80bfced3 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.h +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H #define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + #include <set> namespace arm_compute @@ -69,7 +69,10 @@ public: * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input * @param[in] config FFT descriptor metadata. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel * * @param[in] input Source tensor info. Data types supported: F16/F32.
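For reference, the reformatted CLFFTRadixStageKernel::validate() entry point above can be exercised with a minimal sketch along these lines; the shape, radix and stage values are illustrative assumptions, not taken from the library's tests.

// Sketch only: asks whether a radix-4 first stage on axis 0 would be a valid
// configuration for a 2-channel (interleaved real/imaginary) F32 input of length 64.
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"

using namespace arm_compute;

Status check_radix_stage()
{
    const TensorInfo src(TensorShape(64U), 2, DataType::F32); // 2 channels, as required by validate_arguments()

    FFTRadixStageKernelInfo config{};
    config.axis           = 0;    // only axes 0 and 1 are accepted
    config.radix          = 4;    // must be in supported_radix() = {2, 3, 4, 5, 7, 8}
    config.Nx             = 1;    // assumed value for an illustrative first stage
    config.is_first_stage = true;

    // nullptr output requests the in-place path, mirroring run_in_place above
    return CLFFTRadixStageKernel::validate(&src, nullptr, config);
}

An error-free Status means configure() would accept the same arguments; the same pattern applies to the other validate() overloads reformatted in this patch.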
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp index 8a714d71bf..be6e16b074 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.cpp +++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -54,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) } } // namespace -CLFFTScaleKernel::CLFFTScaleKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -65,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config) +void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -78,20 +81,22 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen // Create kernel CLBuildOptions build_opts; build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() + : input->info()->num_channels())); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option_if(config.conjugate, "-DCONJ"); std::string kernel_name = "fft_scale_conj"; _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments - unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 
0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg(idx, config.scale); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); @@ -130,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h index cc518be193..b995282e02 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.h +++ b/src/core/CL/kernels/CLFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H #define ARM_COMPUTE_CLFFTSCALEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -63,7 +63,10 @@ public: * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] config Kernel configuration */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
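The configure() body above follows the CLBuildOptions idiom used throughout these kernels: variants of the OpenCL program are selected by accumulating -D defines and handing them to create_kernel(). A condensed sketch of that idiom, with assumed stand-in values in place of the queries normally made on the tensors:

// Sketch only: mirrors the option building in CLFFTScaleKernel::configure();
// the header locations and hard-coded values are assumptions for illustration.
#include "arm_compute/core/CL/CLCompileContext.h" // CLBuildOptions (assumed location)
#include "arm_compute/core/CL/CLHelpers.h"        // get_cl_type_from_data_type()
#include "arm_compute/core/Types.h"
#include "support/StringSupport.h"                // support::cpp11::to_string()

using namespace arm_compute;

CLBuildOptions make_fft_scale_options(bool run_in_place, unsigned int num_channels, bool conjugate)
{
    CLBuildOptions build_opts;
    build_opts.add_option_if(run_in_place, "-DIN_PLACE");                              // defined only when true
    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_channels));    // 1 or 2 channels
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(DataType::F32)); // e.g. "float"
    build_opts.add_option_if(conjugate, "-DCONJ");
    return build_opts; // build_opts.options() is what create_kernel() ultimately consumes
}

In the real configure(), VEC_SIZE comes from output->info()->num_channels() when an output tensor is provided and from the input otherwise, which is exactly the ternary that the reformat now splits across two lines.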
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index fcd99a4ed9..86bb502da3 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -31,14 +31,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { -CLFillBorderKernel::CLFillBorderKernel() - : ICLKernel(), _tensor(nullptr) +CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -56,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue ICLKernel::add_argument(idx, static_cast(value)); } -void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { _tensor = tensor; configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1); - auto padding_info = get_padding_info({ tensor }); + auto padding_info = get_padding_info({tensor}); border_size.limit(tensor->padding()); // If there is no border: early exit - if(border_size.empty() || border_mode == BorderMode::UNDEFINED) + if (border_size.empty() || border_mode == BorderMode::UNDEFINED) { return; } @@ -98,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Create static kernel arguments - const unsigned int valid_width = tensor->valid_region().shape[0]; - const unsigned int valid_height = tensor->valid_region().shape[1]; - const cl_int2 valid_region_coords = - { - { - static_cast(tensor->valid_region().anchor[0]), - static_cast(tensor->valid_region().anchor[1]), - } - }; - const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; + const unsigned int valid_width = tensor->valid_region().shape[0]; + const unsigned int valid_height = tensor->valid_region().shape[1]; + const cl_int2 valid_region_coords = {{ + static_cast(tensor->valid_region().anchor[0]), + static_cast(tensor->valid_region().anchor[1]), + }}; + const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; // Set static kernel arguments 
unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters ICLKernel::add_argument(idx, valid_width); ICLKernel::add_argument(idx, valid_height); ICLKernel::add_argument(idx, valid_region_coords); - if(BorderMode::CONSTANT == border_mode) + if (BorderMode::CONSTANT == border_mode) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -175,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } - const auto tensor = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); @@ -193,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl:: unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } @@ -216,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h index 7951f48171..5782143cf9 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.h +++ b/src/core/CL/kernels/CLFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -57,7 +58,11 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in,out] tensor Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. @@ -65,7 +70,10 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
*/ - void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in] compile_context The compile context to be used. @@ -74,7 +82,11 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Function to set the constant value on fill border kernel depending on type. * diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp index 68fe324df6..7da0679ae4 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp @@ -30,20 +30,26 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -54,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) 
{ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -101,28 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false) + : _input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + _bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false) { _type = CLKernelType::ELEMENTWISE; } -void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); - auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma }); + auto padding_info = + get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma}); _input_weights = input_weights; _input_bias = input_bias; @@ -135,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c _epsilon = epsilon; _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights); - _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); + _run_in_place_bias = + (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? 
bn_gamma->info() : nullptr, epsilon, + fbn_type)); // Configure kernel window Window win = calculate_max_window(*input_weights->info()); @@ -165,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type())); - build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); + build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, + "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W"); @@ -180,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -202,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl:: // Add kernel arguments unsigned int idx = 0; add_3D_tensor_argument(idx, _input_weights, slice_3d); - if(_input_bias != nullptr) + if (_input_bias != nullptr) { add_1D_tensor_argument(idx, _input_bias, slice_1d); } add_1D_tensor_argument(idx, _bn_mean, slice_1d); add_1D_tensor_argument(idx, _bn_var, slice_1d); - if(!_run_in_place_weights) + if (!_run_in_place_weights) { add_3D_tensor_argument(idx, _fused_weights, slice_3d); } - if(!_run_in_place_bias) + if (!_run_in_place_bias) { add_1D_tensor_argument(idx, _fused_bias, slice_1d); } - if(_bn_beta != nullptr) + if (_bn_beta != nullptr) { add_1D_tensor_argument(idx, _bn_beta, slice_1d); } - if(_bn_gamma != nullptr) + if (_bn_gamma != nullptr) { add_1D_tensor_argument(idx, _bn_gamma, slice_1d); } diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h index 78b1e74cab..76ec7a759f 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h @@ -62,9 +62,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
*/ - void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Set the source, destination of the kernel * * @param[in] compile_context The compile context to be used. @@ -81,9 +88,17 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -101,10 +116,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp index 5495023b80..c11a18940a 100644 --- a/src/core/CL/kernels/CLGatherKernel.cpp +++ b/src/core/CL/kernels/CLGatherKernel.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLGatherKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -34,7 +36,8 @@ namespace arm_compute { namespace { -inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +inline Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); const uint32_t actual_axis = wrap_around(axis, static_cast(input->num_dimensions())); @@ -43,11 +46,12 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -56,12 +60,14 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); const uint32_t actual_axis = wrap_around(axis, static_cast(input->num_dimensions())); // Output auto initialization if not yet initialized - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); auto_init_if_empty((*output), output_shape, 1, input->data_type()); // Create window @@ -72,8 +78,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace -CLGatherKernel::CLGatherKernel() - : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { _type = CLKernelType::ELEMENTWISE; } @@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGatherKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); - auto padding_info = get_padding_info({ input, output, indices }); + auto padding_info = get_padding_info({input, output, indices}); 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window @@ -100,7 +109,8 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC // Set build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option("-DINDICES_DIM_Z=" + support::cpp11::to_string(indices->info()->dimension(2))); build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); @@ -114,10 +124,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); return Status{}; } diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h index 8f472a4696..db4b49d2f5 100644 --- a/src/core/CL/kernels/CLGatherKernel.h +++ b/src/core/CL/kernels/CLGatherKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. 
Defaults to 0 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel * @@ -74,7 +79,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp index 088c454f3c..b9ff72b928 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { size_t feature_height = info.feat_height(); size_t feature_width = info.feat_width(); @@ -57,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -66,21 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc } } // namespace -CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() - : _anchors(nullptr), _all_anchors(nullptr) +CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info); } -void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors); - auto padding_info = get_padding_info({ anchors, all_anchors }); + auto padding_info = get_padding_info({anchors, 
all_anchors}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info)); // Metadata @@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors)); build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi())); - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); @@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex } // Create kernel - const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + const std::string kernel_name = + (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields). 
// This means we don't need to pad on the X dimension, as we know in advance how many fields @@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h index d26795ac7d..e08f281d6c 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h @@ -62,7 +62,10 @@ public: * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel * @@ -81,5 +84,5 @@ private: const ICLTensor *_anchors; ICLTensor *_all_anchors; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp index 7ed323c950..b13eb16556 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -39,17 +40,20 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; @@ -59,27 +63,30 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr 
&& output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; } } // namespace -CLComputeMeanVariance::CLComputeMeanVariance() - : _input(nullptr), _output(nullptr) +CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision) +void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + bool use_mixed_precision) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? input : output; @@ -88,7 +95,8 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); CLBuildOptions build_opts; - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -108,7 +116,7 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const TensorShape out_shape(input_channel, 2u, input_batches); // Output auto initialization if not yet initialized - if(use_mixed_precision) + if (use_mixed_precision) { auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32); } @@ -134,7 +142,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue) Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if (_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,10 +165,14 @@ CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info) +void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? 
input : output; @@ -172,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision + ? "float" + : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); @@ -188,7 +202,7 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(1)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type()); } @@ -197,7 +211,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info)); return Status{}; @@ -211,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if (_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -226,7 +242,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu add_4D_tensor_argument(idx, _input, collapsed_window); add_3D_tensor_argument(idx, _mean, collapsed_window); - if(!_run_in_place) + if (!_run_in_place) { add_4D_tensor_argument(idx, _output, collapsed_window); } diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h index 2f9014a651..9f436da7f6 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. 
* @param[in] info Kernel meta-data descriptor */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. * @@ -69,7 +73,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -106,7 +111,8 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] use_mixed_precision Use mixed precision in case of FP16 execution */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); + void + configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. * diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index 542d380e4a..9ed9d7c5b0 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -31,10 +31,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -43,7 +43,8 @@ namespace { constexpr int max_input_tensor_dim = 3; -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -53,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -78,16 +80,22 @@ CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel() 
_type = CLKernelType::ELEMENTWISE; } -void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure( + const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon); } -void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); - auto padding_info = get_padding_info({ input, sum, output }); + auto padding_info = get_padding_info({input, sum, output}); _input = input; _sum = sum; @@ -95,8 +103,9 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _actual_axis = wrap_around(axis, max_input_tensor_dim); _epsilon = epsilon; - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; // Set build options CLBuildOptions build_opts; @@ -107,7 +116,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context // Create kernel std::string kernel_name; unsigned int idx = 0; - switch(_actual_axis) + switch (_actual_axis) { case 0: kernel_name = "l2_normalize_x"; @@ -127,7 +136,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set epsilon argument - if(input->info()->data_type() == DataType::F32) + if (input->info()->data_type() == DataType::F32) { _kernel.setArg(idx, _epsilon); } @@ -146,7 +155,8 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status CLL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); return Status{}; @@ -159,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue Window window_sum(window); - switch(_actual_axis) + switch (_actual_axis) { case 0: { @@ -173,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 1: @@ -189,8 +198,7 @@ void 
CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 2: @@ -205,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_3D_tensor_argument(idx, _sum, sum_slice); add_3D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); + } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); } break; default: diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h index edc0585217..5c9ab94ce5 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -70,7 +71,12 @@ public: * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2 * @param[in] epsilon Lower bound value for the normalization. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon); /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel. 
* @@ -84,7 +90,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp index dc9d68626d..e560f1de4a 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,26 +43,31 @@ using namespace misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_size_x = pool_info.pool_size.width; - const int pool_size_y = pool_info.pool_size.height; + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -71,17 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() - : _input(nullptr), _output(nullptr), _indices(nullptr) +CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr) { _type = CLKernelType::POOL; } -void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, 
const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info())); - auto padding_info = get_padding_info({ input, indices, output }); + auto padding_info = get_padding_info({input, indices, output}); _input = input; _output = output; @@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices)); @@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice); add_3D_tensor_argument(idx, _indices, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h index 45481d0507..eb18a46784 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
@@ -72,7 +76,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp index ac33468ad8..8632bdf623 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -69,15 +70,19 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); _run_in_place = (output == nullptr) || (output == input); - ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? 
output->info() : nullptr, epsilon)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()); } @@ -85,7 +90,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_ _input = input; _output = output; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; @@ -134,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h index a1ba2b905e..e02a3c58a3 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h @@ -66,7 +66,10 @@ public: * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output = nullptr, + float epsilon = 1e-8f); /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. 
In case of @p output tensor info = nullptr, diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp index c6c4229c00..b636c485e7 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -63,7 +64,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -71,9 +73,10 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; Window win; const DataLayout data_layout = input->data_layout(); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info); const bool is_norm_across_width = norm_idx == 0; @@ -87,15 +90,16 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND because of the extra output padding - const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; - const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; - const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + is_norm_across_width ? 
norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; + const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); win = calculate_max_window(*input, Steps(vec_size_x)); // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding // Reads can occur within the valid region of the input - if(is_norm_across_width) + if (is_norm_across_width) { AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0); window_changed = window_changed || update_window_and_padding(win, input_access); @@ -112,13 +116,14 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen else { unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); - if(norm_info.is_cross_map()) + if (norm_info.is_cross_map()) { vec_size_x = 1; } win = calculate_max_window(*input, Steps(vec_size_x)); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -139,10 +144,13 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info) +void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); @@ -152,16 +160,17 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; - if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC) + const DataLayout data_layout = input->info()->data_layout(); + unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC) { vec_size_x = 1; vec_size_x_leftovers = 0; } - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info); _is_norm_across_width = norm_idx == 0; @@ -175,9 +184,10 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND the extra output padding - const unsigned int border_width_left = _is_norm_across_width ? 
norm_radius : 0; - const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; - _border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; + _border_size = BorderSize(0, border_width_right, 0, border_width_left); } const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); @@ -193,12 +203,14 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2))); build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); - build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), + "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, + "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); // Create kernel std::string kernel_name; - if(norm_info.is_in_map()) + if (norm_info.is_in_map()) { kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout)); } @@ -222,16 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _config_id += support::cpp11::to_string(input->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(1)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info) +Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); return Status{}; } @@ -251,7 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h index 739a2ae9f1..5517ba6904 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.h +++ 
b/src/core/CL/kernels/CLNormalizationLayerKernel.h @@ -63,7 +63,10 @@ public: * Data layouts supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -77,7 +80,7 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp index 6b0400d50e..59352a8fb7 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp @@ -31,32 +31,35 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors"); - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0)); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -77,7 +80,8 @@ std::pair validate_and_configure_window_nchw(ITensorInfo *input, bool window_changed = update_window_and_padding(win, input_access, output_access); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -88,12 +92,19 @@ CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std); @@ -102,7 +113,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -112,9 +123,10 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ const DataLayout data_layout = input->info()->data_layout(); // Get number of elements to process per iterations - const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(), - input->info()->dimension(0)) : - (16 / input->info()->element_size()); + const unsigned int num_elems_processed_per_iteration = + (data_layout == DataLayout::NHWC) + ? 
adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)) + : (16 / input->info()->element_size()); const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const DataType dt = input->info()->data_type(); @@ -122,11 +134,12 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ CLBuildOptions build_opts; build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); + build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx)))); std::string kernel_name = "normalize_planar_yuv_layer_"; - if(is_data_type_quantized(dt)) + if (is_data_type_quantized(dt)) { const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset))); @@ -139,7 +152,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -165,12 +178,16 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _config_id += support::cpp11::to_string(input->info()->dimension(2)); } -Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); } return Status{}; } @@ -196,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h index 6db4433e78..341b404e3d 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h @@ -67,7 +67,11 @@ public: * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. 
* Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. @@ -79,7 +83,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp index 53f313c0d3..0ac285038e 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.cpp +++ b/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -35,25 +36,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions())); - if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) + if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) { ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3); const auto is_reflect = static_cast(mode == PaddingMode::REFLECT); - for(size_t i = 0; i < padding.size(); ++i) + for (size_t i = 0; i < padding.size(); ++i) { ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect)); ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect)); } } - if(output->total_size() > 0) + if (output->total_size() > 0) { TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); @@ -65,41 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLPadLayerKernel::CLPadLayerKernel() - : _input(nullptr), _output(nullptr), _4d_enabled(false) +CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false) { _type = CLKernelType::ELEMENTWISE; } -void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure( + const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { 
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3); // Set build options - const DataType &data_type = input->info()->data_type(); - const unsigned int input_width = input->info()->dimension(0); - const unsigned int input_height = input->info()->dimension(1); - const unsigned int input_depth = input->info()->dimension(2); - const unsigned int pad_x_before = padding.at(0).first; - const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; - const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0; - const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast(element_size_from_data_type(input->info()->data_type()))), input_width); - const unsigned int pad_right_start = input_width + pad_x_before; - const unsigned int pad_x_before_remainder = pad_x_before % vec_size; - const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); + const DataType &data_type = input->info()->data_type(); + const unsigned int input_width = input->info()->dimension(0); + const unsigned int input_height = input->info()->dimension(1); + const unsigned int input_depth = input->info()->dimension(2); + const unsigned int pad_x_before = padding.at(0).first; + const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; + const unsigned int pad_z_before = padding.size() > 2 ? 
padding.at(2).first : 0; + const unsigned int vec_size = adjust_vec_size( + std::min(16U, 32U / static_cast(element_size_from_data_type(input->info()->data_type()))), + input_width); + const unsigned int pad_right_start = input_width + pad_x_before; + const unsigned int pad_x_before_remainder = pad_x_before % vec_size; + const unsigned int vec_size_leftover_write = + vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); @@ -108,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder)); build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write)); - if(padding.size() > 1) + if (padding.size() > 1) { build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); - if(padding.size() > 2) + if (padding.size() > 2) { build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before)); build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth)); @@ -121,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const } std::string kernel_name = "pad_layer_"; - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { kernel_name += "constant"; - const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); + const unsigned int vec_size_leftover_read = + vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type)); build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read)); - if(pad_x_before >= vec_size) + if (pad_x_before >= vec_size) { build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size)); - build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size)); + build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + + support::cpp11::to_string(pad_right_start / vec_size)); } - if(_4d_enabled) + if (_4d_enabled) { build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first)); build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3))); @@ -154,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const const unsigned int pad_x_after_remainder = pad_right_start % vec_size; const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect; - const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); + const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect)); build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder)); - build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); - build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + 
support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x)); - build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); + build_opts.add_option_if(after_pad_fact_x < output_last_x, + "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); break; } @@ -179,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode)); return Status{}; @@ -197,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); - if(_4d_enabled) + if (_4d_enabled) { add_argument(idx, batch++); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h index 90af337f94..dca121b6a1 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.h +++ b/src/core/CL/kernels/CLPadLayerKernel.h @@ -56,7 +56,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + void configure(const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Set the input and output tensor. * * @param[in] compile_context The compile context to be used. @@ -68,8 +72,12 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), - PaddingMode mode = PaddingMode::CONSTANT); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel * * @param[in] input Source tensor info. Data types supported: All. 
@@ -80,7 +88,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp index bf1b874dd0..7dcdf1de6f 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0"); } @@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions should match"); } - for(unsigned int i = 0; i < info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); } @@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, return Status{}; } 
-std::pair validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors) +std::pair validate_and_configure_window(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + const PriorBoxLayerInfo &info, + int num_priors) { ARM_COMPUTE_UNUSED(input2); // Output tensor auto initialization if not yet initialized @@ -88,10 +97,11 @@ std::pair validate_and_configure_window(const ITensorInfo *input auto_init_if_empty(*output, output_shape, 1, input1->data_type()); const unsigned int num_elems_processed_per_iteration = 4 * num_priors; - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); bool window_changed = update_window_and_padding(win, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -102,13 +112,25 @@ CLPriorBoxLayerKernel::CLPriorBoxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios); } -void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, - cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -135,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c int img_width = info.img_size().x; int img_height = info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = input2->info()->dimension(width_idx); img_height = input2->info()->dimension(height_idx); @@ -143,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c float step_x = info.steps()[0]; float step_y = info.steps()[0]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast(img_width) / layer_width; step_y = static_cast(img_height) / layer_height; @@ -162,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset())); build_opts.add_option_if(info.clip(), "-DIN_PLACE"); - if(info.variances().size() > 1) + if (info.variances().size() > 1) { - for(unsigned int i = 0; i < info.variances().size(); ++i) + for (unsigned int i = 0; i < 
info.variances().size(); ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(i))); } } else { - for(unsigned int i = 0; i < 4; ++i) + for (unsigned int i = 0; i < 4; ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(0))); } } @@ -194,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c ICLKernel::configure_internal(win_config.second); } -Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), + output->clone().get(), info, num_priors) + .first); return Status{}; } @@ -211,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data()); - queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data()); - if(!_info.max_sizes().empty()) + queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), + _info.aspect_ratios().data()); + if (!_info.max_sizes().empty()) { queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data()); } diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h index 6c369a7a4e..a50e0c5ff5 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h @@ -57,7 +57,13 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios); + void configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. 
@@ -69,8 +75,14 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, - cl::Buffer *aspect_ratios); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel * * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC. @@ -80,14 +92,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input1; - const ICLTensor *_input2; + const ICLTensor *_input1; + const ICLTensor *_input2; ICLTensor *_output; PriorBoxLayerInfo _info; int _num_priors; diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp index bd573e54c8..731fcb8e04 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp @@ -22,10 +22,12 @@ * SOFTWARE. */ #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -49,14 +51,19 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size(); /* If width is less then step, then make step same as width to avoid global size being step instead of actual width. */ /* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */ - const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration; + const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) + ? 
input->dimension(0) + : temp_num_elems_processed_per_iteration; // This kernel doesn't need padding Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output); @@ -72,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -87,10 +94,14 @@ CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel() _type = CLKernelType::ELEMENTWISE; } -void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); - auto padding_info = get_padding_info({ input, weight, bias, output }); + auto padding_info = get_padding_info({input, weight, bias, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info())); @@ -104,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ int32_t output_multiplier{}; int32_t output_shift{}; const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform(); - const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); + const Status status = + quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); output_shift *= -1; // Set build options @@ -114,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); - build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMIN_BOUND=" + + support::cpp11::to_string(std::get<0>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMAX_BOUND=" + + support::cpp11::to_string(std::get<1>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); // Create kernel _kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options()); @@ -135,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const 
CLCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias); } -Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); @@ -171,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h index 31085c37ba..ba912e1d2d 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h @@ -63,7 +63,11 @@ public: * @param[in] weight Weight tensor. Data types supported: Same as @p input. * @param[in] bias Bias tensor. Data types supported: S32. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias); /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16. 
@@ -73,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp index 69a6fa5fa0..c97910ef79 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,24 +43,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -82,12 +88,19 @@ CLROIAlignLayerKernel::CLROIAlignLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); @@ -97,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); output->info()->set_data_layout(input->info()->data_layout()); - auto padding_info = get_padding_info({ input, rois, output }); + auto padding_info = get_padding_info({input, rois, output}); _input = input; _output = output; @@ -111,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())); - build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH)))); - build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); - build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); + build_opts.add_option("-DMAX_DIM_X=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::WIDTH)))); + build_opts.add_option("-DMAX_DIM_Y=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); + build_opts.add_option("-DMAX_DIM_Z=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())); build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale())); build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); - build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); + build_opts.add_option_if(pool_info.sampling_ratio() > 0, + "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); - if(is_qasymm) + if (is_qasymm) { const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform(); @@ -144,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h 
index 5284a5913f..2e84e5d303 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.h +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h @@ -61,7 +61,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -77,7 +78,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -93,7 +98,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp index f6933c6cfd..1b2c414a49 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,7 +49,10 @@ CLROIPoolingLayerKernel::CLROIPoolingLayerKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); @@ -61,10 +65,11 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != 
pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -72,20 +77,30 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor return Status{}; } -void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { - ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); + ARM_COMPUTE_ERROR_THROW_ON( + CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); - auto padding_info = get_padding_info({ input, rois, output }); + auto padding_info = get_padding_info({input, rois, output}); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); - auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); + auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); // Set instance variables _input = input; @@ -107,11 +122,12 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())); - if(is_qasymm) + if (is_qasymm) { // Determine quantization info scale, offset UniformQuantizationInfo uqinfo = UniformQuantizationInfo(); - uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform()); + uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), + _output->info()->quantization_info().uniform()); build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset)); build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale)); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h index 7b7b457632..80bfb63092 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h @@ -59,7 +59,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. 
*/ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -74,7 +75,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -92,7 +97,10 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ICLTensor *_input; diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp index a06c2eed75..622f6210b9 100644 --- a/src/core/CL/kernels/CLRangeKernel.cpp +++ b/src/core/CL/kernels/CLRangeKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); @@ -56,19 +54,22 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), + 
"start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -CLRangeKernel::CLRangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -78,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRangeKernel::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step)); // Configure kernel window - unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - auto padding_info = get_padding_info({ output }); + auto padding_info = get_padding_info({output}); _start = start; _end = end; @@ -100,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DSTART=" + support::cpp11::to_string(start)); build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step)); - if(is_data_type_quantized_asymmetric(output->info()->data_type())) + if (is_data_type_quantized_asymmetric(output->info()->data_type())) { const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset)); diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h index 1b94a099ed..65251a11e5 100644 --- a/src/core/CL/kernels/CLRangeKernel.h +++ 
b/src/core/CL/kernels/CLRangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLRANGEKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index e5cfb997ca..70875a2d40 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -28,15 +28,15 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -47,23 +47,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - if(input->num_channels() == 1) + if (input->num_channels() == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); } else { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(axis == 0); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, + "Not supported reduction operation for QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8) - && (input->data_type() != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer"); + ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && + (input->data_type() != DataType::QASYMM8) && + (input->data_type() != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), + "Not supported reduction operation, use CLArgMinMaxLayer"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -79,33 +84,42 @@ CLReductionOperationKernel::CLReductionOperationKernel() _type = 
CLKernelType::ELEMENTWISE; } -void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _reduction_axis = axis; _op = op; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); // Set build options CLBuildOptions build_opts; DataType data_type = input->info()->data_type(); std::string data_type_promoted{}; - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { data_type_promoted = "int"; } @@ -130,10 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD"); build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN"); build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX"); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); - - switch(op) + build_opts.add_option_if(is_data_type_quantized(data_type), + "-DOFFSET=" + + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); + build_opts.add_option_if( + is_data_type_quantized(data_type), + "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); + + switch (op) { case ReductionOperation::SUM_SQUARE: build_opts.add_option(("-DOPERATION=square_sum")); @@ -159,7 +177,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte std::string kernel_axis_name; const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(axis) + switch (axis) { case 0: { @@ -187,13 +205,17 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(vec_size)); - win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step())); + win.set(Window::DimX, + 
Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step())); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLReductionOperationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -205,18 +227,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { // We use parallel reduction only in non quantized types - if(is_serial_op) + if (is_serial_op) { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + Window window_in{window}; + window_in.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); - Window out_window{ window }; + Window out_window{window}; out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); Window in_slice = window_in.first_slice_window_1D(); @@ -228,8 +251,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_1D_tensor_argument(idx, _input, in_slice); add_1D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); } else { @@ -251,8 +273,9 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que case 1: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window window_in{window}; + window_in.set(Window::DimY, + Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); Window in_slice = window_in.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); @@ -262,15 +285,15 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } break; case 2: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window window_in{window}; + window_in.set(Window::DimZ, + Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); Window in_slice = window_in.first_slice_window_3D(); Window out_slice = window.first_slice_window_3D(); @@ -280,14 +303,13 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, 
out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); } break; case 3: { // Get first input and output slices - Window window_in{ window }; + Window window_in{window}; window_in.set(3, Window::Dimension(0, 1, 1)); Window in_slice = window_in.first_slice_window_4D(); Window out_slice = window.first_slice_window_4D(); @@ -298,8 +320,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); } break; default: diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h index b456378746..2f94b2add3 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.h +++ b/src/core/CL/kernels/CLReductionOperationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -67,7 +68,11 @@ public: * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel. 
* @@ -79,7 +84,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp index 3c74e80d33..9fd21943e8 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.cpp +++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -51,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, + "The width of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, + "The height of the input tensor must be a multiple of stride"); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); + const TensorInfo tensor_info_output = + output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -66,8 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLReorgLayerKernel::CLReorgLayerKernel() - : _input(nullptr), _output(nullptr) +CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -77,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - std::string kernel_name = 
std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); - const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + std::string kernel_name = + std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); + const size_t idx_channel = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); // Create kernel CLBuildOptions build_opts; @@ -98,7 +106,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons // Configure window // auto inizialize the output tensor if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); Window win = calculate_max_window(*output->info(), Steps()); @@ -119,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride) +Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + int32_t stride) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride)); @@ -139,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h index 455a6170c6..f335071e9f 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.h +++ b/src/core/CL/kernels/CLReorgLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREORGLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 0d70ff4f3c..79a0f03b1e 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -60,8 +61,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLReverseKernel::CLReverseKernel() - : _input(nullptr), _output(nullptr), _axis(nullptr) +CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr) { 
_type = CLKernelType::ELEMENTWISE; } @@ -71,10 +71,13 @@ void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); } -void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); - auto padding_info = get_padding_info({ input, output, axis }); + auto padding_info = get_padding_info({input, output, axis}); _input = input; _output = output; @@ -138,7 +141,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_1D_tensor_argument(idx, _axis, axis_slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h index 4a21e4f802..fbd99dc883 100644 --- a/src/core/CL/kernels/CLReverseKernel.h +++ b/src/core/CL/kernels/CLReverseKernel.h @@ -60,7 +60,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis); /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel * diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp index c0e014e8b8..703c64d8d3 100644 --- a/src/core/CL/kernels/CLSelectKernel.cpp +++ b/src/core/CL/kernels/CLSelectKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -51,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape())); - ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); + ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && + ((c->tensor_shape().num_dimensions() > 1) || + (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output); @@ -63,13 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen } } // namespace -CLSelectKernel::CLSelectKernel() - : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) +CLSelectKernel::CLSelectKernel() : 
_c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) { _type = CLKernelType::ELEMENTWISE; } -void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelectKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info())); @@ -80,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC _output = output; _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions()); - auto padding_info = get_padding_info({ c, x, y, output }); + auto padding_info = get_padding_info({c, x, y, output}); const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0)); const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x; @@ -92,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC // Create kernel std::string kernel_name = "select"; - if(_has_same_rank) + if (_has_same_rank) { kernel_name += "_same_rank"; } else { const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2; - if(is_input_rank_greater_than_two) + if (is_input_rank_greater_than_two) { const size_t width = x->info()->tensor_shape().x(); const size_t height = x->info()->tensor_shape().y(); @@ -128,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) +Status +CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output)); return Status{}; @@ -142,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); - if(!_has_same_rank) + if (!_has_same_rank) { Window vector_slice = window.first_slice_window_1D(); vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -153,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu do { unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor(); - if(_has_same_rank) + if (_has_same_rank) { add_3D_tensor_argument(idx, _c, slice); } @@ -162,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h index b8c10cd7cf..c4256fd743 100644 --- a/src/core/CL/kernels/CLSelectKernel.h +++ b/src/core/CL/kernels/CLSelectKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSELECTKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,11 @@ public: * @param[out] y Second input tensor. 
Data types supported: Same as @p x * @param[in] output Output tensor. Data types supported: Same as @p x. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel * * @param[in] c Condition input tensor. Data types supported: U8. diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp index 3632ae2b03..f4c0839ad2 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,19 +39,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *block_info, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2}); ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2}); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); @@ -61,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -70,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + TensorShape expected_output_shape = 
misc::shape_calculator::compute_space_to_batch_shape( + input, block_shape_x, block_shape_y, padding_left, padding_right); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -88,16 +97,24 @@ CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); - auto padding_info = get_padding_info({ input, block_shape, paddings, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); + auto padding_info = get_padding_info({input, block_shape, paddings, output}); _input = input; _block_shape = block_shape; @@ -111,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch))); - _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -126,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor 
*output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, - ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -153,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); @@ -166,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x())); build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y())); build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y())); - _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, 
const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -218,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h index 4817cfeef2..f9dce9db47 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -73,7 +78,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); + void configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] compile_context The compile context to be used. @@ -84,8 +94,13 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. 
Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -95,7 +110,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -107,7 +125,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp index c5ffdb588b..25662b5c62 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -64,8 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -75,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + 
ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); @@ -94,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -136,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h index bb1ac5f9a6..d0932919e0 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
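For orientation, the space_to_depth kernels above shrink both spatial dimensions by block_shape and grow the channel dimension by block_shape squared, which is the shape relation behind the WIDTH_IN/CHANNEL_SIZE/BLOCK_SHAPE build options. A minimal stand-alone sketch of that relation (illustrative only, not code from this patch; space_to_depth_shape is a hypothetical helper and an NHWC-style [width, height, channels, batch] ordering with exact multiples of block_shape is assumed):

#include <array>
#include <cstdint>

// Illustrative sketch: expected output extents for a space-to-depth rearrangement.
std::array<int32_t, 4> space_to_depth_shape(const std::array<int32_t, 4> &in, int32_t block_shape)
{
    return {
        in[0] / block_shape,               // width shrinks by block_shape
        in[1] / block_shape,               // height shrinks by block_shape
        in[2] * block_shape * block_shape, // channels grow by block_shape^2
        in[3]                              // batch is unchanged
    };
}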
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp index 075c93ab60..23e26716e7 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.cpp +++ b/src/core/CL/kernels/CLStackLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); @@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); @@ -73,18 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi } } // namespace -CLStackLayerKernel::CLStackLayerKernel() - : _input(nullptr), _output(nullptr) +CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output); } -void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info())); @@ -112,10
+123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons _kernel.setArg(idx, idx_input); } -Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status CLStackLayerKernel::validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); return Status{}; } diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h index 2865127a90..d3c17f529c 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.h +++ b/src/core/CL/kernels/CLStackLayerKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_CLSTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,8 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); /** Initialise the kernel's inputs and output * * @note Supported input tensor rank: up to 4 @@ -74,7 +76,12 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel * * @note Supported input tensor rank: up to 4 @@ -88,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index 9acbafdb19..a8f6112820 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -22,11 +22,13 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLStridedSliceKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/bit_ops.h" @@ -37,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ -76,28 +80,33 @@ CLStridedSliceKernel::CLStridedSliceKernel() _type = CLKernelType::ELEMENTWISE; } -void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + auto padding_info = get_padding_info({input, output}); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); const TensorShape &input_shape = 
input->tensor_shape(); Coordinates starts_abs; Coordinates ends_abs; Coordinates final_strides; - std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(starts_abs, ends_abs, final_strides) = + arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); Window win = calculate_max_window(*output, Steps()); @@ -108,29 +117,33 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0); // Update window if needed - if(multi_access_x) + if (multi_access_x) { Window &updated_window = win; updated_window.set(Window::DimX, - Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); + Window::Dimension(updated_window.x().start(), + ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); // Create build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i); - build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i])); - build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i])); + build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(starts_abs[i])); + build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(final_strides[i])); build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i)); } - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max(output_width_x - vec_size_x, 0))); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option_if_else(input_shape.num_dimensions() > 2, - "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), - "-DSRC_DEPTH=1"); + "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1"); build_opts.add_option_if_else(output->num_dimensions() > 2, "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()), "-DDST_DEPTH=1"); @@ -142,7 +155,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext 
&compile_context, co _config_id = "strided_slice"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->data_type())); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->dimension(i)); @@ -156,11 +169,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); return Status{}; } @@ -170,8 +189,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_4D(); @@ -182,7 +202,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_4D(slice)); + } while (window_collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h index 4c201504f5..1cf5bcacec 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.h +++ b/src/core/CL/kernels/CLStridedSliceKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" #include @@ -53,9 +54,15 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -71,9 +78,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp index 3e7015cfd2..fa996c4008 100644 --- a/src/core/CL/kernels/CLTileKernel.cpp +++ b/src/core/CL/kernels/CLTileKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/core/CL/kernels/CLTileKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -39,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -55,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLTileKernel::CLTileKernel() - : _input(nullptr), _output(nullptr) +CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -66,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTileKernel::configure(const 
CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTileKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -104,15 +106,14 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT // Configure window without padding Window win = calculate_max_window(*output->info()); - if(multi_access_x) + if (multi_access_x) { // If multi-access is enabled, no thread should cross the tile boundaries. This means we need // as many threads as those to cover a single tile times multiples[0]. Note that if threads // do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and // we don't need to pad the output const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0]; - win.set(Window::DimX, - Window::Dimension(win.x().start(), size_win_x, vec_size_x)); + win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x)); } ICLKernel::configure_internal(win); @@ -121,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT _config_id = "tile"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); - for(unsigned int i = 0; i < multiples.size(); ++i) + for (unsigned int i = 0; i < multiples.size(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(i)); @@ -150,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h index 41752ca90b..c3486aecef 100644 --- a/src/core/CL/kernels/CLTileKernel.h +++ b/src/core/CL/kernels/CLTileKernel.h @@ -64,7 +64,10 @@ public: * @param[out] output Destination tensor. Same as @p input * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples); /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel * * @param[in] input Source tensor info. Data type supported: All. 
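The tile kernel above repeats the source tensor along each dimension according to multiples, so every output coordinate maps back to a source coordinate by a modulo with the source extent. A minimal stand-alone sketch of that mapping (illustrative only, not code from this patch; tile_source_coord is a hypothetical helper):

#include <cstddef>
#include <vector>

// Illustrative sketch: map an output coordinate of a tiled tensor back to the
// source coordinate it copies from, given the source extents.
std::vector<size_t> tile_source_coord(const std::vector<size_t> &out_coord, const std::vector<size_t> &src_shape)
{
    std::vector<size_t> src_coord(out_coord.size());
    for (size_t d = 0; d < out_coord.size(); ++d)
    {
        src_coord[d] = out_coord[d] % src_shape[d]; // tiling repeats the source every src_shape[d] elements
    }
    return src_coord;
}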
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp index 6a3f66fd5a..9980db42f3 100644 --- a/src/core/CPP/CPPTypes.cpp +++ b/src/core/CPP/CPPTypes.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/Error.h" + #include "src/common/cpuinfo/CpuInfo.h" #include "src/common/cpuinfo/CpuIsaInfo.h" @@ -43,8 +44,7 @@ CPUInfo &CPUInfo::get() return _cpuinfo; } -CPUInfo::CPUInfo() - : _impl(std::make_unique<Impl>()) +CPUInfo::CPUInfo() : _impl(std::make_unique<Impl>()) { _impl->info = cpuinfo::CpuInfo::build(); } diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h index df192b5131..fe253508cf 100644 --- a/src/core/CPP/Validate.h +++ b/src/core/CPP/Validate.h @@ -38,8 +38,8 @@ namespace arm_compute * * @return Status */ -inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info) +inline Status +error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info) { bool fp16_kernels_enabled = false; #if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) @@ -47,8 +47,9 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi #endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), - function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function, + file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above"); return Status{}; } @@ -61,8 +62,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info) +inline Status +error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info) { bool bf16_kernels_enabled = false; #if defined(ARM_COMPUTE_ENABLE_BF16) @@ -70,8 +71,9 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled), - function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled), + function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above"); return Status{}; } @@ -84,8 +86,8 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, - const ITensor *tensor) +inline Status +error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor ==
nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info())); @@ -101,8 +103,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, - const ITensor *tensor) +inline Status +error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info())); diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp index 0f405d8e83..02686eb4f6 100644 --- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp +++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/WindowHelpers.h" #include @@ -34,7 +35,11 @@ namespace arm_compute namespace { template <typename T> -std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id) +std::vector<int> SoftNMS(const ITensor *proposals, + std::vector<std::vector<T>> &scores_in, + std::vector<int> inds, + const BoxNMSLimitInfo &info, + int class_id) { std::vector<int> keep; const int proposals_width = proposals->info()->dimension(1); @@ -45,7 +50,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & std::vector<T> y2(proposals_width); std::vector<T> areas(proposals_width); - for(int i = 0; i < proposals_width; ++i) + for (int i = 0; i < proposals_width; ++i) { x1[i] = *reinterpret_cast<const T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i))); y1[i] = *reinterpret_cast<const T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i))); @@ -56,13 +61,13 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & // Note: Soft NMS scores have already been initialized with input scores - while(!inds.empty()) + while (!inds.empty()) { // Find proposal with max score among remaining proposals int max_pos = 0; - for(unsigned int i = 1; i < inds.size(); ++i) + for (unsigned int i = 1; i < inds.size(); ++i) { - if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)]) + if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)]) { max_pos = i; } @@ -75,7 +80,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & inds.erase(inds.begin()); std::vector<int> sorted_indices_temp; - for(auto idx : inds) + for (auto idx : inds) { const auto xx1 = std::max(x1[idx], x1[element]); const auto yy1 = std::max(y1[idx], y1[element]); @@ -89,7 +94,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & // Update scores based on computed IoU, overlap threshold and NMS method T weight; - switch(info.soft_nms_method()) + switch (info.soft_nms_method()) { case NMSType::LINEAR: weight = (ovr > info.nms()) ?
(1.f - ovr) : 1.f; @@ -106,7 +111,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & // Discard boxes with new scores below min threshold and update pending indices scores_in[class_id][idx] *= weight; - if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres()) + if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres()) { sorted_indices_temp.push_back(idx); } @@ -118,7 +123,10 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & } template <typename T> -std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id) +std::vector<int> NonMaximaSuppression(const ITensor *proposals, + std::vector<int> sorted_indices, + const BoxNMSLimitInfo &info, + int class_id) { std::vector<int> keep; @@ -130,7 +138,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> std::vector<T> y2(proposals_width); std::vector<T> areas(proposals_width); - for(int i = 0; i < proposals_width; ++i) + for (int i = 0; i < proposals_width; ++i) { x1[i] = *reinterpret_cast<const T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i))); y1[i] = *reinterpret_cast<const T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i))); @@ -139,7 +147,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0); } - while(!sorted_indices.empty()) + while (!sorted_indices.empty()) { int i = sorted_indices.at(0); keep.push_back(i); @@ -148,7 +156,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> std::vector<int> new_indices; sorted_indices_temp.erase(sorted_indices_temp.begin()); - for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j) + for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j) { const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]); const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]); @@ -163,8 +171,9 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> const float ctr_y = yy1 + (h / 2); // If suppress_size is specified, filter the boxes based on their size and position - const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height()); - if(ovr <= info.nms() && keep_size) + const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && + ctr_x < info.im_width() && ctr_y < info.im_height()); + if (ovr <= info.nms() && keep_size) { new_indices.push_back(j); } @@ -172,7 +181,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> const unsigned int new_indices_size = new_indices.size(); std::vector<int> new_sorted_indices(new_indices_size); - for(unsigned int i = 0; i < new_indices_size; ++i) + for (unsigned int i = 0; i < new_indices_size; ++i) { new_sorted_indices[i] = sorted_indices[new_indices[i] + 1]; } @@ -184,7 +193,15 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> } // namespace CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel() - : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr), + : _scores_in(nullptr), + _boxes_in(nullptr), + _batch_splits_in(nullptr), + _scores_out(nullptr), + _boxes_out(nullptr), + _classes(nullptr), + _batch_splits_out(nullptr), + _keeps(nullptr), + _keeps_size(nullptr), _info() { } @@ -197,7 +214,7 @@ bool
CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const template <typename T> void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() { - const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0); + const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0); const int num_classes = _scores_in->info()->dimension(0); const int scores_count = _scores_in->info()->dimension(1); std::vector<int> total_keep_per_batch(batch_size); @@ -205,51 +222,48 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() int total_keep_count = 0; std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count)); - for(int i = 0; i < scores_count; ++i) + for (int i = 0; i < scores_count; ++i) { - for(int j = 0; j < num_classes; ++j) + for (int j = 0; j < num_classes; ++j) { in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i))); } } int cur_start_idx = 0; - for(int b = 0; b < batch_size; ++b) + for (int b = 0; b < batch_size; ++b) { // Skip first class if there is more than 1 except if the number of classes is 1. const int j_start = (num_classes == 1 ? 0 : 1); - for(int j = j_start; j < num_classes; ++j) + for (int j = j_start; j < num_classes; ++j) { std::vector<T> cur_scores(scores_count); std::vector<int> inds; - for(int i = 0; i < scores_count; ++i) + for (int i = 0; i < scores_count; ++i) { const T score = in_scores[j][i]; cur_scores[i] = score; - if(score > _info.score_thresh()) + if (score > _info.score_thresh()) { inds.push_back(i); } } - if(_info.soft_nms_enabled()) + if (_info.soft_nms_enabled()) { keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j); } else { std::sort(inds.data(), inds.data() + inds.size(), - [&cur_scores](int lhs, int rhs) - { - return cur_scores[lhs] > cur_scores[rhs]; - }); + [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; }); keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j); } total_keep_count += keeps[j].size(); } - if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im()) + if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im()) { // merge all scores (represented by indices) together and sort auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]() @@ -257,10 +271,10 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() std::vector<T> ret(total_keep_count); int ret_idx = 0; - for(unsigned int i = 1; i < keeps.size(); ++i) + for (unsigned int i = 1; i < keeps.size(); ++i) { auto &cur_keep = keeps[i]; - for(auto &ckv : cur_keep) + for (auto &ckv : cur_keep) { ret[ret_idx++] = in_scores[i][ckv]; } @@ -273,13 +287,13 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() auto all_scores_sorted = get_all_scores_sorted(); const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()]; - for(int j = 1; j < num_classes; ++j) + for (int j = 1; j < num_classes; ++j) { auto &cur_keep = keeps[j]; std::vector<int> new_keeps_j; - for(auto &k : cur_keep) + for (auto &k : cur_keep) { - if(in_scores[j][k] >= image_thresh) + if (in_scores[j][k] >= image_thresh) { new_keeps_j.push_back(k); } @@ -293,40 +307,52 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() // Write results int cur_out_idx = 0; - for(int j = j_start; j < num_classes; ++j) + for (int j = j_start; j < num_classes; ++j) { - auto &cur_keep = keeps[j]; - auto cur_out_scores = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); -
auto cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); - const int box_column = (cur_start_idx + cur_out_idx) * 4; - - for(unsigned int k = 0; k < cur_keep.size(); ++k) + auto &cur_keep = keeps[j]; + auto cur_out_scores = + reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); + auto cur_out_classes = + reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); + const int box_column = (cur_start_idx + cur_out_idx) * 4; + + for (unsigned int k = 0; k < cur_keep.size(); ++k) { - cur_out_scores[k] = in_scores[j][cur_keep[k]]; - cur_out_classes[k] = static_cast<T>(j); - auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k))); - auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k))); - auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k))); - auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k))); - *cur_out_box_row0 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k]))); - *cur_out_box_row1 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k]))); - *cur_out_box_row2 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k]))); - *cur_out_box_row3 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k]))); + cur_out_scores[k] = in_scores[j][cur_keep[k]]; + cur_out_classes[k] = static_cast<T>(j); + auto cur_out_box_row0 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k))); + auto cur_out_box_row1 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k))); + auto cur_out_box_row2 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k))); + auto cur_out_box_row3 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k))); + *cur_out_box_row0 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k]))); + *cur_out_box_row1 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k]))); + *cur_out_box_row2 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k]))); + *cur_out_box_row3 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k]))); } cur_out_idx += cur_keep.size(); } - if(_keeps != nullptr) + if (_keeps != nullptr) { cur_out_idx = 0; - for(int j = 0; j < num_classes; ++j) + for (int j = 0; j < num_classes; ++j) { - for(unsigned int i = 0; i < keeps[j].size(); ++i) + for (unsigned int i = 0; i < keeps[j].size(); ++i) { - *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i)); + *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = + static_cast<T>(keeps[j].at(i)); } - *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size(); + *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = + keeps[j].size(); cur_out_idx += keeps[j].size(); } } @@ -334,17 +360,25 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() cur_start_idx += total_keep_count; } - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { - for(int b = 0; b < batch_size; ++b) + for (int b = 0; b < batch_size; ++b) {
*reinterpret_cast(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b]; } } } -void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32); @@ -352,25 +386,28 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_ const unsigned int num_classes = scores_in->info()->dimension(0); ARM_COMPUTE_UNUSED(num_classes); - ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes"); - ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows"); + ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), + "First dimension of input boxes must be of size 4*num_classes"); + ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), + "Input scores and input boxes must have the same number of rows"); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1)); ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0)); - if(keeps != nullptr) + if (keeps != nullptr) { - ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output"); + ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, + "keeps_size cannot be nullptr if keeps has to be provided as output"); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0)); } - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in); } - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out); } @@ -399,7 +436,7 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window); - switch(_scores_in->info()->data_type()) + switch (_scores_in->info()->data_type()) { case DataType::F32: run_nmslimit(); diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp index c1187ff2b3..1224ec14a7 100644 --- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp +++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp @@ -35,15 +35,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo 
*bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size, - const float score_threshold, const float iou_threshold) +Status validate_arguments(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, + "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, + "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, + "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0"); @@ -55,15 +62,26 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, } // namespace CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel() - : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0) + : _input_bboxes(nullptr), + _input_scores(nullptr), + _output_indices(nullptr), + _max_output_size(0), + _score_threshold(0.f), + _iou_threshold(0.f), + _num_boxes(0) { } -void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, - unsigned int max_output_size, const float score_threshold, const float iou_threshold) +void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, + const ITensor *input_scores, + ITensor *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), + max_output_size, score_threshold, iou_threshold)); auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo()); @@ -82,10 +100,15 @@ void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, cons ICPPKernel::configure(win); } -Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, - unsigned int max_output_size, const float score_threshold, const float 
iou_threshold) +Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold)); return Status{}; } @@ -99,10 +122,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo // Auxiliary tensors std::vector indices_above_thd; std::vector scores_above_thd; - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { const float score_i = *(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(i)))); - if(score_i >= _score_threshold) + if (score_i >= _score_threshold) { scores_above_thd.emplace_back(score_i); indices_above_thd.emplace_back(i); @@ -114,12 +137,9 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo std::vector sorted_indices; sorted_indices.resize(num_above_thd); std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0); - std::sort(std::begin(sorted_indices), - std::end(sorted_indices), + std::sort(std::begin(sorted_indices), std::end(sorted_indices), [&](unsigned int first, unsigned int second) - { - return scores_above_thd[first] > scores_above_thd[second]; - }); + { return scores_above_thd[first] > scores_above_thd[second]; }); // Number of output is the minimum between max_detection and the scores above the threshold const unsigned int num_output = std::min(_max_output_size, num_above_thd); @@ -127,19 +147,20 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo std::vector visited(num_above_thd, false); // Keep only boxes with small IoU - for(unsigned int i = 0; i < num_above_thd; ++i) + for (unsigned int i = 0; i < num_above_thd; ++i) { // Check if the output is full - if(output_idx >= num_output) + if (output_idx >= num_output) { break; } // Check if it was already visited, if not add it to the output and update the indices counter - if(!visited[sorted_indices[i]]) + if (!visited[sorted_indices[i]]) { - *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]]; - visited[sorted_indices[i]] = true; + *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = + indices_above_thd[sorted_indices[i]]; + visited[sorted_indices[i]] = true; ++output_idx; } else @@ -148,28 +169,36 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo } // Once added one element at the output check if the next ones overlap and can be skipped - for(unsigned int j = i + 1; j < num_above_thd; ++j) + for (unsigned int j = i + 1; j < num_above_thd; ++j) { - if(!visited[sorted_indices[j]]) + if (!visited[sorted_indices[j]]) { // Calculate IoU const unsigned int i_index = indices_above_thd[sorted_indices[i]]; const unsigned int j_index = indices_above_thd[sorted_indices[j]]; // Box-corner format: xmin, ymin, xmax, ymax - const auto box_i_xmin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, i_index)))); - const auto box_i_ymin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, i_index)))); - const auto box_i_xmax = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, i_index)))); - 
const auto box_i_ymax = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, i_index)))); - - const auto box_j_xmin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, j_index)))); - const auto box_j_ymin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, j_index)))); - const auto box_j_xmax = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, j_index)))); - const auto box_j_ymax = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, j_index)))); + const auto box_i_xmin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, i_index)))); + const auto box_i_ymin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, i_index)))); + const auto box_i_xmax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, i_index)))); + const auto box_i_ymax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, i_index)))); + + const auto box_j_xmin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, j_index)))); + const auto box_j_ymin = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, j_index)))); + const auto box_j_xmax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(2, j_index)))); + const auto box_j_ymax = + *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(3, j_index)))); const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin); const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin); float overlap; - if(area_i <= 0 || area_j <= 0) + if (area_i <= 0 || area_j <= 0) { overlap = 0.0f; } @@ -179,11 +208,12 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo const auto x_min_intersection = std::max(box_i_xmin, box_j_xmin); const auto y_max_intersection = std::min(box_i_ymax, box_j_ymax); const auto x_max_intersection = std::min(box_i_xmax, box_j_xmax); - const auto area_intersection = std::max(y_max_intersection - y_min_intersection, 0.0f) * std::max(x_max_intersection - x_min_intersection, 0.0f); - overlap = area_intersection / (area_i + area_j - area_intersection); + const auto area_intersection = std::max(y_max_intersection - y_min_intersection, 0.0f) * + std::max(x_max_intersection - x_min_intersection, 0.0f); + overlap = area_intersection / (area_i + area_j - area_intersection); } - if(overlap > _iou_threshold) + if (overlap > _iou_threshold) { visited[sorted_indices[j]] = true; } @@ -192,7 +222,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo } // The output could be full but not the output indices tensor // Instead return values not valid we put -1 - for(; output_idx < _max_output_size; ++output_idx) + for (; output_idx < _max_output_size; ++output_idx) { *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1; } diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp index 054c7bf05a..e68090d82b 100644 --- a/src/core/CPP/kernels/CPPPermuteKernel.cpp +++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm); // Validate configured output - 
if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -65,7 +66,7 @@ void CPPPermuteKernel::run_permute(const Window &window) // Create output window Window window_out(window); const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= _perm.num_dimensions(); ++d) + for (size_t d = 0; d <= _perm.num_dimensions(); ++d) { window_out.set(d, zero_window); } @@ -74,28 +75,32 @@ void CPPPermuteKernel::run_permute(const Window &window) Iterator in(_input, window); Iterator out(_output, window_out); - if(_input->info()->num_dimensions() <= 3) + if (_input->info()->num_dimensions() <= 3) { - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2]; - *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2]; + *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); + }, + in, out); } - else if(_input->info()->num_dimensions() >= 4) + else if (_input->info()->num_dimensions() >= 4) { - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3]; - *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + + id[3] * perm_strides[3]; + *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); + }, + in, out); } } -CPPPermuteKernel::CPPPermuteKernel() - : _func(), _input(nullptr), _output(nullptr), _perm() +CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm() { } @@ -113,7 +118,7 @@ void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const Pe _output = output; _perm = perm; - switch(input->info()->element_size()) + switch (input->info()->element_size()) { case 1: _func = &CPPPermuteKernel::run_permute; @@ -152,7 +157,7 @@ void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_func != nullptr) + if (_func != nullptr) { (this->*_func)(window); } diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp index d2b54e412e..6ffb68e770 100644 --- a/src/core/CPP/kernels/CPPTopKVKernel.cpp +++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp @@ -34,32 +34,34 @@ namespace arm_compute { namespace { -template ::value, int>::type = 0> +template ::value, int>::type = 0> inline bool greater_than(T a, T b) { const T epsilon = std::numeric_limits::epsilon(); return (a - b > epsilon); } -template < typename T, - typename std::enable_if < !utils::traits::is_floating_point::value, int >::type = 0 > +template ::value, int>::type = 0> inline bool greater_than(T a, T b) { return (a > b); } -Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status validate_arguments(const ITensorInfo *predictions, + const 
ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1)); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); @@ -72,22 +74,23 @@ Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *tar template void CPPTopKVKernel::run_topkv() { - for(unsigned int i = 0; i < _batch_size; ++i) + for (unsigned int i = 0; i < _batch_size; ++i) { - const auto target_class_id = *reinterpret_cast(_targets->ptr_to_element(Coordinates{ i })); - const auto predicted_value = *reinterpret_cast(_predictions->ptr_to_element(Coordinates{ target_class_id, i })); + const auto target_class_id = *reinterpret_cast(_targets->ptr_to_element(Coordinates{i})); + const auto predicted_value = + *reinterpret_cast(_predictions->ptr_to_element(Coordinates{target_class_id, i})); // The variable rank indicates how many values there are before the target_class_id unsigned int rank = 0; - for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j) + for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j) { - const auto current_prediction = *reinterpret_cast(_predictions->ptr_to_element(Coordinates{ j, i })); - if(greater_than(current_prediction, predicted_value)) + const auto current_prediction = *reinterpret_cast(_predictions->ptr_to_element(Coordinates{j, i})); + if (greater_than(current_prediction, predicted_value)) { rank++; } } - *(_output->ptr_to_element(Coordinates{ i })) = static_cast(rank < _k); + *(_output->ptr_to_element(Coordinates{i})) = static_cast(rank < _k); } } @@ -96,7 +99,10 @@ CPPTopKVKernel::CPPTopKVKernel() { } -void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k) +void CPPTopKVKernel::configure(const ITensor *predictions, + const ITensor *targets, + ITensor *output, + const unsigned int k) { ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output); @@ -115,7 +121,10 @@ void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *target ICPPKernel::configure(Window()); // Default 1 iteration window } -Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKVKernel::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k)); return Status{}; @@ -129,7 +138,7 @@ bool CPPTopKVKernel::is_parallelisable() const void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(window, info); - switch(_predictions->info()->data_type()) + switch (_predictions->info()->data_type()) { case 
DataType::F32: run_topkv(); diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp index 7ef83fb2c4..b1efe32446 100644 --- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp +++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/WindowHelpers.h" #include @@ -31,8 +32,7 @@ namespace arm_compute { -CPPUpsampleKernel::CPPUpsampleKernel() - : _input(nullptr), _output(nullptr), _info() +CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info() { } @@ -82,7 +82,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) const size_t element_size = _input->info()->element_size(); // The fill value is normally 0, but for quantized types '0' corresponds to the offset - switch(_output->info()->data_type()) + switch (_output->info()->data_type()) { case DataType::QASYMM8: { @@ -102,7 +102,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) // Create window Window window_out(window); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width)); window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height)); @@ -117,10 +117,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, window); Iterator out(_output, window_out); - execute_window_loop(window, [&](const Coordinates &) - { - memcpy(out.ptr(), in.ptr(), element_size); - }, - in, out); + execute_window_loop( + window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/Error.cpp b/src/core/Error.cpp index 5c8d45c987..679a93f9af 100644 --- a/src/core/Error.cpp +++ b/src/core/Error.cpp @@ -36,9 +36,10 @@ Status arm_compute::create_error(ErrorCode error_code, std::string msg) return Status(error_code, msg); } -Status arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg) +Status +arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg) { - std::array out{ 0 }; + std::array out{0}; snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg); return Status(error_code, std::string(out.data())); } diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp index 292acf8633..2d1a13cb33 100644 --- a/src/core/GPUTarget.cpp +++ b/src/core/GPUTarget.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/GPUTarget.h" + #include "arm_compute/core/Log.h" #include @@ -31,47 +32,47 @@ namespace { arm_compute::GPUTarget get_valhall_target(const std::string &version) { - if(version.find("G77") != std::string::npos) + if (version.find("G77") != std::string::npos) { return arm_compute::GPUTarget::G77; } - else if(version.find("G57") != std::string::npos) + else if (version.find("G57") != std::string::npos) { return arm_compute::GPUTarget::G57; } - if(version.find("G68") != std::string::npos) + if (version.find("G68") != std::string::npos) { return arm_compute::GPUTarget::G68; } - if(version.find("G78AE") != std::string::npos) + if (version.find("G78AE") != std::string::npos) { return arm_compute::GPUTarget::G78AE; } - if(version.find("G78") != std::string::npos) + if (version.find("G78") != std::string::npos) { return arm_compute::GPUTarget::G78; } - else if(version.find("G710") != std::string::npos) + else if (version.find("G710") != std::string::npos) { return arm_compute::GPUTarget::G710; } - else if(version.find("G610") != std::string::npos) + else if (version.find("G610") != std::string::npos) { return arm_compute::GPUTarget::G610; } - else if(version.find("G510") != std::string::npos) + else if (version.find("G510") != std::string::npos) { return arm_compute::GPUTarget::G510; } - else if(version.find("G310") != std::string::npos) + else if (version.find("G310") != std::string::npos) { return arm_compute::GPUTarget::G310; } - else if(version.find("G715") != std::string::npos) + else if (version.find("G715") != std::string::npos) { return arm_compute::GPUTarget::G715; } - else if(version.find("G615") != std::string::npos) + else if (version.find("G615") != std::string::npos) { return arm_compute::GPUTarget::G615; } @@ -83,39 +84,39 @@ arm_compute::GPUTarget get_valhall_target(const std::string &version) arm_compute::GPUTarget get_bifrost_target(const std::string &version) { - if(version.find("G71") != std::string::npos) + if (version.find("G71") != std::string::npos) { return arm_compute::GPUTarget::G71; } - else if(version.find("G72") != std::string::npos) + else if (version.find("G72") != std::string::npos) { return arm_compute::GPUTarget::G72; } - else if(version.find("G51BIG") != std::string::npos) + else if (version.find("G51BIG") != std::string::npos) { return arm_compute::GPUTarget::G51BIG; } - else if(version.find("G51LIT") != std::string::npos) + else if (version.find("G51LIT") != std::string::npos) { return arm_compute::GPUTarget::G51LIT; } - else if(version.find("G51") != std::string::npos) + else if (version.find("G51") != std::string::npos) { return arm_compute::GPUTarget::G51; } - else if(version.find("G52LIT") != std::string::npos) + else if (version.find("G52LIT") != std::string::npos) { return arm_compute::GPUTarget::G52LIT; } - else if(version.find("G52") != std::string::npos) + else if (version.find("G52") != std::string::npos) { return arm_compute::GPUTarget::G52; } - else if(version.find("G76") != std::string::npos) + else if (version.find("G76") != std::string::npos) { return arm_compute::GPUTarget::G76; } - else if(version.find("G31") != std::string::npos) + else if (version.find("G31") != std::string::npos) { return arm_compute::GPUTarget::G31; } @@ -127,15 +128,15 @@ arm_compute::GPUTarget get_bifrost_target(const std::string &version) arm_compute::GPUTarget get_midgard_target(const std::string &version) { - if(version.find("T600") != std::string::npos) + if (version.find("T600") != std::string::npos) { return arm_compute::GPUTarget::T600; 
} - else if(version.find("T700") != std::string::npos) + else if (version.find("T700") != std::string::npos) { return arm_compute::GPUTarget::T700; } - else if(version.find("T800") != std::string::npos) + else if (version.find("T800") != std::string::npos) { return arm_compute::GPUTarget::T800; } @@ -150,34 +151,16 @@ namespace arm_compute { const std::string &string_from_target(GPUTarget target) { - static std::map gpu_target_map = - { - { GPUTarget::MIDGARD, "midgard" }, - { GPUTarget::BIFROST, "bifrost" }, - { GPUTarget::VALHALL, "valhall" }, - { GPUTarget::T600, "t600" }, - { GPUTarget::T700, "t700" }, - { GPUTarget::T800, "t800" }, - { GPUTarget::G71, "g71" }, - { GPUTarget::G72, "g72" }, - { GPUTarget::G51, "g51" }, - { GPUTarget::G51BIG, "g51big" }, - { GPUTarget::G51LIT, "g51lit" }, - { GPUTarget::G31, "g31" }, - { GPUTarget::G76, "g76" }, - { GPUTarget::G52, "g52" }, - { GPUTarget::G52LIT, "g52lit" }, - { GPUTarget::G77, "g77" }, - { GPUTarget::G57, "g57" }, - { GPUTarget::G78, "g78" }, - { GPUTarget::G68, "g68" }, - { GPUTarget::G78AE, "g78ae" }, - { GPUTarget::G710, "g710" }, - { GPUTarget::G610, "g610" }, - { GPUTarget::G510, "g510" }, - { GPUTarget::G310, "g310" }, - { GPUTarget::G715, "g715" }, - { GPUTarget::G615, "g615" }, + static std::map gpu_target_map = { + {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"}, + {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"}, + {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"}, + {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"}, + {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"}, + {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"}, + {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"}, + {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"}, + {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, }; return gpu_target_map[target]; @@ -189,7 +172,7 @@ GPUTarget get_target_from_name(const std::string &device_name) std::smatch name_parts; const bool found_mali = std::regex_search(device_name, name_parts, mali_regex); - if(!found_mali) + if (!found_mali) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Maliâ„¢ GPU. Target is set to default."); return GPUTarget::MIDGARD; @@ -203,22 +186,22 @@ GPUTarget get_target_from_name(const std::string &device_name) // Work-out gpu target GPUTarget gpu_target; - if(target == 'G' || is_future_gpu) + if (target == 'G' || is_future_gpu) { // Check for Valhall or Bifrost gpu_target = get_valhall_target(version); - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = get_bifrost_target(version); } // Default GPUTarget - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = GPUTarget::VALHALL; } } - else if(target == 'T') + else if (target == 'T') { gpu_target = get_midgard_target(version); } @@ -228,7 +211,7 @@ GPUTarget get_target_from_name(const std::string &device_name) } // Report in case of unknown target - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Maliâ„¢ Mali GPU unknown. Target is set to the default one. 
(BIFROST)"); return GPUTarget::BIFROST; diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp index 28e7f4c1e5..c801b097b5 100644 --- a/src/core/Helpers.cpp +++ b/src/core/Helpers.cpp @@ -25,8 +25,11 @@ namespace arm_compute { -ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, - InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined) +ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, + const TensorShape &dst_shape, + InterpolationPolicy interpolate_policy, + SamplingPolicy sampling_policy, + bool border_undefined) { const DataLayout data_layout = src_info.data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -49,9 +52,9 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens auto valid_end_out_y = std::min(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]); // Handle valid points in case of the bi-linear interpolation - if(border_undefined) + if (border_undefined) { - switch(interpolate_policy) + switch (interpolate_policy) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -90,7 +93,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens } // Setup output valid region - ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() }; + ValidRegion valid_region{Coordinates(), dst_shape, dst_shape.num_dimensions()}; valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x)); valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y)); @@ -109,14 +112,12 @@ const std::map> &get_layout_map() constexpr DataLayoutDimension D = DataLayoutDimension::DEPTH; constexpr DataLayoutDimension N = DataLayoutDimension::BATCHES; - static const std::map> layout_map = - { - { DataLayout::NDHWC, { C, W, H, D, N } }, - { DataLayout::NCDHW, { W, H, D, C, N } }, - { DataLayout::NHWC, { C, W, H, N } }, - { DataLayout::NCHW, { W, H, C, N } } - }; + static const std::map> layout_map = { + {DataLayout::NDHWC, {C, W, H, D, N}}, + {DataLayout::NCDHW, {W, H, D, C, N}}, + {DataLayout::NHWC, {C, W, H, N}}, + {DataLayout::NCHW, {W, H, C, N}}}; return layout_map; } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp index 832801255f..923c5f8a85 100644 --- a/src/core/IAccessWindow.cpp +++ b/src/core/IAccessWindow.cpp @@ -29,14 +29,18 @@ using namespace arm_compute; -ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const +ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, + const ValidRegion &input_valid_region) const { return compute_valid_region(window, input_valid_region, false, BorderSize(0)); } -ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -45,7 +49,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va Coordinates old_anchor(anchor); TensorShape &shape = input_valid_region.shape; - if(!border_undefined) + if (!border_undefined) { border_size = BorderSize(0); } @@ 
-56,7 +60,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va // Additionally the valid region is shifted by the offset that is used by // the kernel to write back output values. anchor.set(0, std::max(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { anchor.set(1, std::max(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y); } @@ -69,15 +73,19 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va // old size is first converted into end points to compared against the // execution window. Afterwards the new end points are converted back into // a size of the region. - shape.set(0, std::min(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]); - if(_info->num_dimensions() > 1) + shape.set(0, std::min(old_anchor[0] + shape[0] - border_size.right, + (window.x().end() - window.x().step()) * _scale_x + _width) - + anchor[0]); + if (_info->num_dimensions() > 1) { - shape.set(1, std::min(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]); + shape.set(1, std::min(old_anchor[1] + shape[1] - border_size.bottom, + (window.y().end() - window.y().step()) * _scale_y + _height) - + anchor[1]); } // For higher dimensions use the intersection of the window size and the // valid region of the input - for(size_t d = 2; d < _info->num_dimensions(); ++d) + for (size_t d = 2; d < _info->num_dimensions(); ++d) { anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d])); shape.set(d, std::min(window[d].end(), input_valid_region.shape[d]) - anchor[d]); @@ -86,9 +94,12 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va return input_valid_region; } -void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size) +void AccessWindowRectangle::set_valid_region(const Window &window, + const ValidRegion &input_valid_region, + bool border_undefined, + const BorderSize &border_size) { - if(_info != nullptr) + if (_info != nullptr) { _info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size)); } @@ -97,17 +108,16 @@ void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRe bool AccessWindowRectangle::update_window_if_needed(Window &window) const { // Only update the window size if we can't use padding - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } - PaddingSize needed = get_needed_padding(window); + PaddingSize needed = get_needed_padding(window); PaddingSize available = _info->padding(); - if(needed.top <= available.top && needed.right <= available.right - && needed.bottom <= available.bottom - && needed.left <= available.left) + if (needed.top <= available.top && needed.right <= available.right && needed.bottom <= available.bottom && + needed.left <= available.left) { return false; } @@ -124,12 +134,12 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height; // Adjust window start for Y dimension - if(min_y < 0) + if (min_y < 0) { // Calculate rows available above the tensor const int front_pad_y_available = -static_cast(offset_first_element / 
strides[1]); - if(min_y < front_pad_y_available) + if (min_y < front_pad_y_available) { // Not enough padding available, need to shrink the window int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y; @@ -144,18 +154,19 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const } // Adjust window end for Y dimension - if(max_y > static_cast(shape[1])) + if (max_y > static_cast(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size(); // Calculate rows available below the tensor const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y; - if(static_cast(shape[1]) + tail_pad_y_available < max_y) + if (static_cast(shape[1]) + tail_pad_y_available < max_y) { // Not enough padding available, need to shrink the window - int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height; - end = std::max(window.y().start(), end / _scale_y); + int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + + window.y().step() * _scale_y - _y - _height; + end = std::max(window.y().start(), end / _scale_y); window.set(1, Window::Dimension(window.y().start(), end, window.y().step())); window_modified = true; @@ -170,11 +181,14 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size(); // Adjust window start for X dimension - if(min_x < 0) + if (min_x < 0) { - const int front_pad_x_available = -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast(strides[0]); + const int front_pad_x_available = + -std::min(static_cast(offset_first_element) - front_pad_y * strides[1], + stride_y - shape[0] * strides[0]) / + static_cast(strides[0]); - if(min_x < front_pad_x_available) + if (min_x < front_pad_x_available) { // Not enough padding available, need to shrink the window int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x; @@ -189,15 +203,16 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const } // Adjust window end for X dimension - if(max_x > static_cast(shape[0])) + if (max_x > static_cast(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x; - if(static_cast(shape[0]) + tail_pad_x_available < max_x) + if (static_cast(shape[0]) + tail_pad_x_available < max_x) { // Not enough padding available, need to shrink the window - int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width; - end = std::max(window.x().start(), end / _scale_x); + int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + + window.x().step() * _scale_x - _x - _width; + end = std::max(window.x().start(), end / _scale_x); window.set(0, Window::Dimension(window.x().start(), end, window.x().step())); window_modified = true; @@ -212,15 +227,15 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const bool AccessWindowRectangle::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } // Update strides in tensor info - return _info->extend_padding( get_needed_padding(window)); + 
return _info->extend_padding(get_needed_padding(window)); } -PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const +PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const { ARM_COMPUTE_ERROR_ON(_scale_x == 0); ARM_COMPUTE_ERROR_ON(_scale_y == 0); diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp index 31f1ec7a3f..fb7e095091 100644 --- a/src/core/IKernel.cpp +++ b/src/core/IKernel.cpp @@ -30,8 +30,7 @@ const Window &IKernel::window() const return _window; } -IKernel::IKernel() - : _window() +IKernel::IKernel() : _window() { // Create an empty window to make sure the children classes set the window values themselves _window.set(Window::DimX, Window::Dimension(0, 0, 1)); diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp index 2f4354cc6f..4dc8ea959b 100644 --- a/src/core/ITensor.cpp +++ b/src/core/ITensor.cpp @@ -35,7 +35,7 @@ namespace arm_compute { void ITensor::copy_from(const ITensor &src) { - if(&src == this) + if (&src == this) { return; } @@ -47,7 +47,7 @@ void ITensor::copy_from(const ITensor &src) ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels()); ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size()); - for(size_t d = 0; d < src_info->num_dimensions(); d++) + for (size_t d = 0; d < src_info->num_dimensions(); d++) { ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d)); } @@ -66,11 +66,7 @@ void ITensor::copy_from(const ITensor &src) const size_t line_size = src_info->element_size() * src_info->dimension(0); execute_window_loop( - win_src, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), line_size); - }, - src_it, dst_it); + win_src, [&](const Coordinates &) { memcpy(dst_it.ptr(), src_it.ptr(), line_size); }, src_it, dst_it); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED @@ -87,10 +83,10 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const stream_status.copyfmt(s); // Set precision - if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default)) + if (is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default)) { int precision = io_fmt.precision; - if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full) + if (io_fmt.precision_type == IOFormatInfo::PrecisionType::Full) { precision = std::numeric_limits().max_digits10; } @@ -101,7 +97,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const size_t print_width = 0; size_t print_height = 0; int start_offset = 0; - switch(io_fmt.print_region) + switch (io_fmt.print_region) { case IOFormatInfo::PrintRegion::NoPadding: print_width = this->info()->dimension(0); @@ -111,13 +107,14 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const case IOFormatInfo::PrintRegion::ValidRegion: print_width = this->info()->valid_region().shape.x(); print_height = this->info()->valid_region().shape.y(); - start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(), - this->info()->valid_region().anchor.y())); + start_offset = this->info()->offset_element_in_bytes( + Coordinates(this->info()->valid_region().anchor.x(), this->info()->valid_region().anchor.y())); break; case IOFormatInfo::PrintRegion::Full: print_width = padding.left + this->info()->dimension(0) + padding.right; print_height = padding.top + this->info()->dimension(1) + padding.bottom; - start_offset = static_cast(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0]; 
+ start_offset = static_cast(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - + padding.left * strides[0]; break; default: break; @@ -129,16 +126,17 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const const uint8_t *ptr = this->buffer() + start_offset; // Start printing - for(size_t i = 0; i < slices2D; ++i) + for (size_t i = 0; i < slices2D; ++i) { // Find max_width of elements in slice to align columns int max_element_width = 0; - if(io_fmt.align_columns) + if (io_fmt.align_columns) { size_t offset = i * strides[2]; - for(size_t h = 0; h < print_height; ++h) + for (size_t h = 0; h < print_height; ++h) { - max_element_width = std::max(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width)); + max_element_width = std::max( + max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width)); offset += strides[1]; } } @@ -146,7 +144,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const // Print slice { size_t offset = i * strides[2]; - for(size_t h = 0; h < print_height; ++h) + for (size_t h = 0; h < print_height; ++h) { print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim); offset += strides[1]; diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp index 90f9a45039..0f8b0824f8 100644 --- a/src/core/ITensorPack.cpp +++ b/src/core/ITensorPack.cpp @@ -27,10 +27,9 @@ namespace arm_compute { -ITensorPack::ITensorPack(std::initializer_list l) - : _pack() +ITensorPack::ITensorPack(std::initializer_list l) : _pack() { - for(auto &e : l) + for (auto &e : l) { _pack[e.id] = e; } @@ -54,7 +53,7 @@ void ITensorPack::add_const_tensor(int id, const ITensor *tensor) const ITensor *ITensorPack::get_const_tensor(int id) const { auto it = _pack.find(id); - if(it != _pack.end()) + if (it != _pack.end()) { return it->second.ctensor != nullptr ? 
it->second.ctensor : it->second.tensor; } @@ -81,4 +80,4 @@ bool ITensorPack::empty() const { return _pack.empty(); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h index e6d0e532c8..5f4d08d0f6 100644 --- a/src/core/NEON/NEAsymm.h +++ b/src/core/NEON/NEAsymm.h @@ -26,6 +26,7 @@ #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute @@ -90,7 +91,7 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, { const static int32x4_t zero_s32 = vdupq_n_s32(0); - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); @@ -130,18 +131,13 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to U8 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = vmaxq_u8(out_u8, min_u8); out_u8 = vminq_u8(out_u8, max_u8); @@ -170,7 +166,7 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, int8x16_t max_s8, bool is_bounded_relu) { - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); @@ -204,18 +200,13 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -247,8 +238,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, const static int32x4_t one_s32 = vdupq_n_s32(1); // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - int32x4x4_t res_shift_gt0 = - { + int32x4x4_t res_shift_gt0 = { vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]), vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]), vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]), @@ -260,8 +250,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]); res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]); - int32x4x4_t res_shift_lt0 = - { + int32x4x4_t res_shift_lt0 = { vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))), 
vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))), vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))), @@ -273,8 +262,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]); // Select result depending on shift value - const uint32x4x4_t mask_lt0 = - { + const uint32x4x4_t mask_lt0 = { #ifdef __aarch64__ vcltzq_s32(result_shift.val[0]), vcltzq_s32(result_shift.val[1]), @@ -300,18 +288,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -332,15 +315,20 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, * * @return Quantized value */ -inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu) +inline uint8_t finalize_quantization(int32_t in_value, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32_t result_offset_after_shift_s32, + uint8_t min_u8, + uint8_t max_u8, + bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); - if(result_shift < 0) + if (result_shift < 0) { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32( + vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { @@ -355,7 +343,7 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul // Bound the result uint8_t out_u8 = static_cast(std::max(0, std::min(255, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = static_cast(std::max(min_u8, std::min(max_u8, out_u8))); } @@ -375,15 +363,20 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul * * @return Quantized value */ -inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - int8_t min_s8, int8_t max_s8, bool is_bounded_relu) +inline int8_t finalize_quantization(int32_t in_value, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32_t result_offset_after_shift_s32, + int8_t min_s8, + int8_t max_s8, + bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); - if(result_shift < 0) + if (result_shift < 0) { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32( + vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { @@ -399,7 +392,7 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult // Bound the result int8_t out_s8 = 
static_cast(std::max(-128, std::min(127, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = static_cast(std::max(min_s8, std::min(max_s8, out_s8))); } @@ -416,17 +409,16 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult */ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -439,17 +431,14 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI */ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), + }}; return vdequantized_input; } @@ -462,19 +451,24 @@ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationIn */ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32( + 
vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -487,19 +481,16 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization */ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + }}; return vdequantized_input; } @@ -513,17 +504,22 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationI */ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset) { - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + 
vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -537,17 +533,14 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs */ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset) { - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + }}; return vdequantized_input; } @@ -560,15 +553,12 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offse */ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale) { - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), - } - }; + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), + }}; return vdequantized_input; } @@ -581,16 +571,13 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale */ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale) { - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - } - }; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), 
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + }}; return vdequantized_input; } @@ -607,18 +594,15 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf const int offset = qi.offset; const float32x4_t voffset = vdupq_n_f32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #endif //__aarch64__ - } - }; + }}; return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } @@ -635,18 +619,15 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat const int offset = qi.offset; const float32x4_t voffset = vdupq_n_f32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #endif //__aarch64__ - } - }; + }}; return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } @@ -654,22 +635,19 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3 { const int32x4_t voffset = vdupq_n_s32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #else //__aarch64__ - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #endif //__aarch64__ - } - }; + }}; 
return rf; } @@ -715,7 +693,7 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQua auto rf = vquantize_internal(qv, qi.scale, qi.offset); const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1])); const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3])); - return { pa, pb }; + return {pa, pb}; } } // namespace arm_compute diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl index ca2aea1e18..fd62fd4654 100644 --- a/src/core/NEON/NEAsymm.inl +++ b/src/core/NEON/NEAsymm.inl @@ -51,14 +51,14 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); // Convert float32 vectors to uint32 vectors #if __aarch64__ - if(round_policy == RoundingPolicy::TO_NEAREST_EVEN) + if (round_policy == RoundingPolicy::TO_NEAREST_EVEN) { A_u32x4 = vcvtnq_u32_f32(A_f32x4); B_u32x4 = vcvtnq_u32_f32(B_f32x4); C_u32x4 = vcvtnq_u32_f32(C_f32x4); D_u32x4 = vcvtnq_u32_f32(D_f32x4); } - else if(round_policy == RoundingPolicy::TO_NEAREST_UP) + else if (round_policy == RoundingPolicy::TO_NEAREST_UP) { A_u32x4 = vcvtaq_u32_f32(A_f32x4); B_u32x4 = vcvtaq_u32_f32(B_f32x4); @@ -86,7 +86,7 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); } -template +template inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo) { // Convert uint8 vectors to int16 vectors @@ -110,14 +110,14 @@ inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); #if __aarch64__ - if(round_policy == RoundingPolicy::TO_NEAREST_EVEN) + if (round_policy == RoundingPolicy::TO_NEAREST_EVEN) { A_s32x4 = vcvtnq_s32_f32(A_f32x4); B_s32x4 = vcvtnq_s32_f32(B_f32x4); C_s32x4 = vcvtnq_s32_f32(C_f32x4); D_s32x4 = vcvtnq_s32_f32(D_f32x4); } - else if(round_policy == RoundingPolicy::TO_NEAREST_UP) + else if (round_policy == RoundingPolicy::TO_NEAREST_UP) { A_s32x4 = vcvtaq_s32_f32(A_f32x4); B_s32x4 = vcvtaq_s32_f32(B_f32x4); diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl index 8bff9c4a8e..fb403b6d26 100644 --- a/src/core/NEON/NEFixedPoint.inl +++ b/src/core/NEON/NEFixedPoint.inl @@ -30,13 +30,7 @@ namespace arm_compute inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b) { - float32x4x2_t res = - { - { - vmaxq_f32(a.val[0], b.val[0]), - vmaxq_f32(a.val[1], b.val[1]) - } - }; + float32x4x2_t res = {{vmaxq_f32(a.val[0], b.val[0]), vmaxq_f32(a.val[1], b.val[1])}}; return res; } #endif /* DOXYGEN_SKIP_THIS */ diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl index 1cbe669373..f875917988 100644 --- a/src/core/NEON/NEMath.inl +++ b/src/core/NEON/NEMath.inl @@ -29,19 +29,16 @@ namespace arm_compute { /** Logarithm polynomial coefficients */ -const std::array log_tab = -{ - { - vdupq_n_f32(-2.29561495781f), - vdupq_n_f32(-2.47071170807f), - vdupq_n_f32(-5.68692588806f), - vdupq_n_f32(-0.165253549814f), - vdupq_n_f32(5.17591238022f), - vdupq_n_f32(0.844007015228f), - vdupq_n_f32(4.58445882797f), - vdupq_n_f32(0.0141278216615f), - } -}; +const std::array log_tab = {{ + vdupq_n_f32(-2.29561495781f), + vdupq_n_f32(-2.47071170807f), + vdupq_n_f32(-5.68692588806f), + vdupq_n_f32(-0.165253549814f), + vdupq_n_f32(5.17591238022f), + vdupq_n_f32(0.844007015228f), + vdupq_n_f32(4.58445882797f), + 
vdupq_n_f32(0.0141278216615f), +}}; /** Sin polynomial coefficients */ constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3) @@ -54,7 +51,7 @@ inline float32x4_t prefer_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { #if __ARM_FEATURE_FMA return vfmaq_f32(a, b, c); -#else // __ARM_FEATURE_FMA +#else // __ARM_FEATURE_FMA return vmlaq_f32(a, b, c); #endif // __ARM_FEATURE_FMA } @@ -73,13 +70,14 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val) { #ifdef __aarch64__ return vrndnq_f32(val); -#else // __aarch64__ +#else // __aarch64__ static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f); static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f); static const int32x4_t CONST_1_INT = vdupq_n_s32(1); const float32x4_t floor_val = vfloorq_f32(val); const float32x4_t diff = vsubq_f32(val, floor_val); - const float32x4_t fp32_upper_limit = vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U + const float32x4_t fp32_upper_limit = + vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U /* * 1. Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0). @@ -95,12 +93,13 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val) * Threshold upper limit with format |S|E(8bits)| Fraction(23bits) | = (23 + 127) << 23 (assuming positive sign): Adding 127, because 127 represents the actual zero in this format. */ - float32x4_t rounded_val = vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), - vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), - vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))), - floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); + float32x4_t rounded_val = vbslq_f32( + vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), + vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), + vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))), + floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); - float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val); + float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val); return result; #endif // __aarch64__ @@ -118,8 +117,8 @@ inline float32x2_t vinvsqrt_f32(float32x2_t x) inline float32x4_t vinvsqrtq_f32(float32x4_t x) { float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); return sqrt_reciprocal; } @@ -152,8 +151,7 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array::infinity()); const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5) @@ -224,9 +224,9 @@ inline float32x4_t vexpq_f32(float32x4_t x) #ifdef __aarch64__ inline float32x4_t verfq_f32(float32x4_t x) { - static const float erffdata[4] = { 0.278393f, 0.230389f, 0.000972f, 0.078108f }; + static const float erffdata[4] = {0.278393f, 0.230389f, 0.000972f, 0.078108f}; static const float32x4_t coeffdata = vld1q_f32(erffdata); - static const float32x4_t onev{ vdupq_n_f32(1.0f) }; + static const float32x4_t onev{vdupq_n_f32(1.0f)}; uint32x4_t selector = vcltzq_f32(x); @@ -287,10 +287,12 @@ inline float32x4_t vtanhq_f32(float32x4_t val) 
float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH); // x * (1 - x^2/3) if |x| < 5.e-3 or (exp2x - 1) / (exp2x + 1) otherwise - float32x4_t exp2x = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x)); - float32x4_t num = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x)); - float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num)); - float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den)); + float32x4_t exp2x = + vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x)); + float32x4_t num = + vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x)); + float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num)); + float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den)); return tanh; } @@ -456,30 +458,23 @@ inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in) inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out) { - out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), - vqmovn_u32(vcvtq_u32_f32(in2.val[0])))); - out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), - vqmovn_u32(vcvtq_u32_f32(in2.val[1])))); - out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), - vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); + out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), vqmovn_u32(vcvtq_u32_f32(in2.val[0])))); + out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), vqmovn_u32(vcvtq_u32_f32(in2.val[1])))); + out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); } inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) { - const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), - vqmovn_u32(vcvtq_u32_f32(in.val[1]))); - const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), - vqmovn_u32(vcvtq_u32_f32(in.val[3]))); - out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); + const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1]))); + const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3]))); + out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); } inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) { - const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), - vqmovn_s32(vcvtq_s32_f32(in.val[1]))); - const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), - vqmovn_s32(vcvtq_s32_f32(in.val[3]))); - out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); + const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1]))); + const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3]))); + out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); } template <> @@ -552,8 +547,8 @@ inline float16x4_t vinvsqrt_f16(float16x4_t x) inline float16x8_t vinvsqrtq_f16(float16x8_t x) { float16x8_t sqrt_reciprocal = vrsqrteq_f16(x); - sqrt_reciprocal = 
vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); return sqrt_reciprocal; } @@ -602,8 +597,8 @@ inline float16x4_t vtanh_rational_approx_f16(float16x4_t x16) inline float16x8_t vtanhq_f16(float16x8_t x) { // Split into high/low and use rational approximation on both parts exactly - const float16x8_t tanh = vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), - vtanh_rational_approx_f16(vget_high_f16(x))); + const float16x8_t tanh = + vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), vtanh_rational_approx_f16(vget_high_f16(x))); // tanh(x) == sign(x) to F16 precision for |x| >= 4.508, use sign after this const float16x8_t ONE = vdupq_n_f16(1.0f); diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h index e6644577a1..ec246efc8c 100644 --- a/src/core/NEON/NESymm.h +++ b/src/core/NEON/NESymm.h @@ -25,7 +25,9 @@ #define ARM_COMPUTE_NESYMM_H #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/NEON/NEMath.h" + #include namespace arm_compute @@ -49,13 +51,10 @@ using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 1 * @return Quantized values */ template -int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int16x8_t min_s16, - int16x8_t max_s16) +int16x8_t finalize_quantization_int16( + int32x4x2_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int16x8_t min_s16, int16x8_t max_s16) { - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift)); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift)); @@ -76,7 +75,7 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, // Convert S32 to S16 int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s16 = vmaxq_s16(out_s16, min_s16); out_s16 = vminq_s16(out_s16, max_s16); @@ -98,13 +97,14 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, * @return Quantized values */ template -inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int16_t min_s16, int16_t max_s16) +inline int16_t finalize_quantization_int16( + int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16) { - if(result_shift < 0) + if (result_shift < 0) { - const int64_t in_64 = static_cast(in_value) * (1 << (-result_shift)) * static_cast(result_fixedpoint_multiplier); - in_value = static_cast((in_64 + (1 << 30)) >> 31); + const int64_t in_64 = static_cast(in_value) * (1 << (-result_shift)) * + static_cast(result_fixedpoint_multiplier); + in_value = static_cast((in_64 + (1 << 30)) >> 31); } else { @@ -117,7 +117,7 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi // Bound the result int16_t out_s16 = static_cast(std::max(-32768, std::min(32767, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_s16 = static_cast(std::max(min_s16, std::min(max_s16, out_s16))); } @@ -134,14 +134,9 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int 
result_fixedpoi */ inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale) { - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale) - } - }; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)}}; return vdequantized_input; } @@ -156,18 +151,13 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale) { const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x2_t rf = - { - { + const int32x4x2_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) + vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) + vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) #endif //__aarch64__ - } - }; + }}; return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); } @@ -180,17 +170,14 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale) */ inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), - } - }; + const float scale = qi.scale; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), + }}; return vdequantized_input; } @@ -206,24 +193,20 @@ inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQua const float scale = qi.scale; ARM_COMPUTE_ERROR_ON(scale == 0.f); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), #endif 
//__aarch64__ - } - }; - const qsymm16x8x2_t res = - { + }}; + const qsymm16x8x2_t res = { vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])), vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])), }; diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h index eea2627c62..a448cde475 100644 --- a/src/core/NEON/SVEAsymm.h +++ b/src/core/NEON/SVEAsymm.h @@ -26,6 +26,7 @@ #if defined(ARM_COMPUTE_ENABLE_SVE2) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -70,10 +71,18 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scal const auto voffset = svdup_n_s32(offset); const auto vscale = svdup_n_f32(scale); const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale)); + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), + vscale)); return vdequantized_input; } @@ -104,10 +113,10 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale const auto voffset = svdup_n_s32(offset); const auto vscale = svdup_n_f32(scale); const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale)); + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale)); return vdequantized_input; } @@ -135,11 +144,11 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const Unifo */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale) { - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3))); + const 
svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3))); return vdequantized_input; } @@ -153,12 +162,12 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svflo */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale) { - const auto vscale = svdup_n_f32(scale); - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale)); + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale)); return vdequantized_input; } diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h index 5ada7ae0ff..6d69b330ba 100644 --- a/src/core/NEON/SVEMath.h +++ b/src/core/NEON/SVEMath.h @@ -28,6 +28,7 @@ #include "src/core/NEON/wrapper/intrinsics/svcvt.h" #include "src/core/NEON/wrapper/intrinsics/svdup_n.h" #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h" + #include #include @@ -181,9 +182,12 @@ svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b); * @return The converted integer vector */ template -int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3); +int_vec_type convert_float_to_int(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3); } // namespace arm_compute #include "src/core/NEON/SVEMath.inl" #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#endif /* ARM_COMPUTE_SVEMATH_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_SVEMATH_H */ diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl index 8973d0b273..b30125dcb7 100644 --- a/src/core/NEON/SVEMath.inl +++ b/src/core/NEON/SVEMath.inl @@ -32,8 +32,16 @@ namespace arm_compute { -inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3, - svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8) +inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, + svfloat32_t x, + svfloat32_t coeff_1, + svfloat32_t coeff_2, + svfloat32_t coeff_3, + svfloat32_t coeff_4, + svfloat32_t coeff_5, + svfloat32_t coeff_6, + svfloat32_t coeff_7, + svfloat32_t coeff_8) { const auto A = svmla_f32_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f32_z(pg, coeff_3, coeff_7, x); @@ -45,8 +53,16 @@ inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t c return res; } -inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3, - svfloat16_t 
coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8) +inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, + svfloat16_t x, + svfloat16_t coeff_1, + svfloat16_t coeff_2, + svfloat16_t coeff_3, + svfloat16_t coeff_4, + svfloat16_t coeff_5, + svfloat16_t coeff_6, + svfloat16_t coeff_7, + svfloat16_t coeff_8) { const auto A = svmla_f16_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f16_z(pg, coeff_3, coeff_7, x); @@ -90,15 +106,17 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3])); const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4])); - const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f - const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f - const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = + svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = + svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f const auto inf = svdup_n_f32(std::numeric_limits::infinity()); - const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) const auto zero = svdup_n_f32(0.f); - const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) // Range reduction: // e^x = 2^n * e^r @@ -114,23 +132,23 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) // (i.e. n) because the decimal part has been pushed out and lost. // * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent // in FP32 format. Left shifting z by 23 bits will result in 2^n. - const auto z = svmla_f32_z(pg, shift, x, inv_ln2); - const auto n = svsub_f32_z(pg, z, shift); - const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32. // This outperforms longer Taylor series (3-4 tabs) both in term of accuracy and performance. const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); - const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); // Compute the truncated Taylor series of e^r. 
// poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) const auto r2 = svmul_f32_z(pg, r, r); - const auto p1 = svmul_f32_z(pg, c1, r); - const auto p23 = svmla_f32_z(pg, c2, c3, r); - const auto p45 = svmla_f32_z(pg, c4, c5, r); - const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); auto poly = svmla_f32_z(pg, scale, p12345, scale); @@ -213,7 +231,8 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x) auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23))); // Polynomial Approximation - auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8); + auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, + log_tab_7, log_tab_8); // Reconstruct poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2); @@ -259,7 +278,8 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val) //Find positive or negative const auto c_v = svabs_z(pg, wrapper::svcvt_z(pg, svmul_z(pg, val, ipi_v))); const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0))); - const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0))); + const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret(c_v), wrapper::svdup_n(IntType(1))), + wrapper::svdup_n(IntType(0))); auto neg_v = sveor_z(pg, odd_v, sign_v); @@ -347,7 +367,10 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) #if defined(ARM_COMPUTE_ENABLE_SVE2) template <> -inline svuint8_t convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svuint8_t convert_float_to_int(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svuint8_t out; const auto all_true_pg = svptrue_b32(); @@ -381,7 +404,10 @@ inline svuint8_t convert_float_to_int(const svfloat32_t &in_0, const } template <> -inline svint8_t convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svint8_t convert_float_to_int(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svint8_t out; const auto all_true_pg = svptrue_b32(); diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h index 6808577681..288d45d979 100644 --- a/src/core/NEON/SVESymm.h +++ b/src/core/NEON/SVESymm.h @@ -28,6 +28,7 @@ #if defined(ARM_COMPUTE_ENABLE_SVE2) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -42,8 +43,10 @@ namespace arm_compute */ inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale) { - const auto vscale = svdup_n_f32(scale); - const svfloat32x2_t vdequantized_input = svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale)); + const auto vscale = svdup_n_f32(scale); + const svfloat32x2_t vdequantized_input = + svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale)); return vdequantized_input; } @@ 
-76,13 +79,13 @@ inline svint16_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x2_t qv, float */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const auto vscale = svdup_n_f32(scale); - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale)); + const float scale = qi.scale; + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale)); return vdequantized_input; } @@ -112,4 +115,4 @@ inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, con } // namespace arm_compute #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#endif // ARM_COMPUTE_NESYMM_H \ No newline at end of file +#endif // ARM_COMPUTE_NESYMM_H diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 108b199df7..deb89996a9 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -28,18 +28,17 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - +#include "src/core/NEON/kernels/batchnormalization/impl/list.h" #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/NEON/kernels/batchnormalization/impl/list.h" -#include "src/core/common/Registrars.h" - #include namespace arm_compute @@ -52,8 +51,15 @@ struct BatchNormalizationSelectorData const CPUInfo &ci; }; using BatchNormalizationSelectorPtr = std::add_pointer::type; -using BatchNormalizationKernelPtr = std::add_pointer::type; +using BatchNormalizationKernelPtr = std::add_pointer::type; struct BatchNormalizationKernel { @@ -62,41 +68,32 @@ struct BatchNormalizationKernel BatchNormalizationKernelPtr ukernel; }; -static const BatchNormalizationKernel available_kernels[] = -{ +static const BatchNormalizationKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization) - }, - { - "sve_fp32_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization) - }, + {"sve_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { 
return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)}, + {"sve_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization) - }, + {"neon_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - { - "neon_fp32_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization) - }, + {"neon_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_NEON) */ }; const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec return nullptr; } -Status -validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); - const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() }); + const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(nullptr != output) + if (nullptr != output) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -131,17 +134,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, 
const IT ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); - if(beta != nullptr) + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); } - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); return Status{}; } @@ -169,10 +173,12 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win // Only compute denominator and constants once per feature map. int slice = -1; - const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (_beta != nullptr) ? reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (_beta != nullptr) ? reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T mean = static_cast(0); T var = static_cast(0); @@ -186,80 +192,83 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); const auto epsilon_vec = wrapper::vdup_n(static_cast(_epsilon), ExactTagType{}); - execute_window_loop(win_to_use, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - if(slice != id.z()) + execute_window_loop( + win_to_use, + [&](const Coordinates &id) { - mean = input_mean[id.z()]; - var = input_var[id.z()]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - if(input_gamma != nullptr) - { - gamma = input_gamma[id.z()]; - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - } - if(input_beta != nullptr) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + if (slice != id.z()) { - beta = input_beta[id.z()]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + mean = input_mean[id.z()]; + var = input_var[id.z()]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + if (input_gamma != nullptr) + { + gamma = input_gamma[id.z()]; + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + } + if (input_beta != nullptr) + { + beta = input_beta[id.z()]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Calculate denominator + denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + denominator = wrapper::vgetlane(denominator_vec, 0); + slice = 
id.z(); } - // Calculate denominator - denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - denominator = wrapper::vgetlane(denominator_vec, 0); - slice = id.z(); - } - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator_vec); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(fused_activation) + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator_vec); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T numerator = input_ptr[x] - mean; - const T x_bar = numerator * denominator; - T res = beta + x_bar * gamma; - - // Perform fused activation - if(fused_activation) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + const T numerator = input_ptr[x] - mean; + const T x_bar = numerator * denominator; + T res = beta + x_bar * gamma; + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + *(output_ptr + x) = res; } - - // Store results - *(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } void NEBatchNormalizationLayerKernel::configure_non_fused() { - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; + _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; break; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: @@ -274,23 +283,25 @@ void NEBatchNormalizationLayerKernel::configure_non_fused() void NEBatchNormalizationLayerKernel::configure_fused() { // NCHW Fused Batched Normalization with activation functions : FP32 - static std::map bn_fused_map_f32_nchw = - { - { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> } - }; + static std::map bn_fused_map_f32_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // NCHW Fused Batched Normalization with activation functions : FP16 - static std::map bn_fused_map_f16_nchw = - { - { 
ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> } - }; + static std::map bn_fused_map_f16_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -307,22 +318,32 @@ void NEBatchNormalizationLayerKernel::configure_fused() } NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info() + : _func(nullptr), + _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _gamma(nullptr), + _beta(nullptr), + _epsilon(), + _act_info() { } -void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, - const ITensor *mean, const ITensor *var, - const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void NEBatchNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, - mean->info(), var->info(), - (beta != nullptr) ? beta->info() : nullptr, - (gamma != nullptr) ? gamma->info() : nullptr, - epsilon, act_info)); + mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, + (gamma != nullptr) ? 
gamma->info() : nullptr, epsilon, act_info)); _input = input; _output = input; @@ -334,16 +355,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, _act_info = act_info; const bool run_in_place = (output == nullptr) || (output == input); - if(!run_in_place) + if (!run_in_place) { _output = output; } // Configure activation function to run const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { - if(_act_info.enabled()) + if (_act_info.enabled()) { configure_fused(); } @@ -357,17 +378,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, Window win = calculate_max_window(*input->info(), Steps()); INEKernel::configure(win); - if(output != nullptr) + if (output != nullptr) { // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } } -Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); @@ -382,13 +407,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW); const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { (this->*_func)(window); } else { - const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() }); + const auto *uk = + get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()}); uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window); } } diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 0551ace30c..2e8ff0dc9a 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
*/ - void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta = nullptr, + const ITensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel * @@ -85,10 +92,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp index 83fb5f6f51..f299bb94a4 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + int block_shape_x, + int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -65,13 +70,14 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); - const TensorInfo 
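For readers checking the reformatted NEBatchNormalizationLayerKernel interface above, a minimal usage sketch (illustrative only, not part of this patch; `src`, `dst`, `mean`, `var`, `beta` and `gamma` stand for hypothetical, already-allocated NCHW tensors of matching shape and data type):

    NEBatchNormalizationLayerKernel bn_kernel;
    // dst = beta + gamma * (src - mean) / sqrt(var + epsilon), here fused with
    // RELU; only RELU, BOUNDED_RELU and LU_BOUNDED_RELU can be fused.
    bn_kernel.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f,
                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    bn_kernel.run(bn_kernel.window(), ThreadInfo{}); // single-threaded run over the full window

Passing a null output pointer, or the input itself, selects the in-place path handled in configure() above.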
expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); } @@ -80,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in } // namespace NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info() + : _input(nullptr), + _block_shape(nullptr), + _output(nullptr), + _data_layout(DataLayout::UNKNOWN), + _block_shape_x(), + _block_shape_y(), + _crop_info() { } @@ -99,15 +111,18 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b ICPPKernel::configure(win); } -void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayerKernel::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; @@ -121,14 +136,19 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_sh ICPPKernel::configure(win); } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -141,7 +161,7 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { // Retrieve the block shapes dynamically 
_block_shape_x = *(reinterpret_cast(_block_shape->ptr_to_element(0))); @@ -155,31 +175,32 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.x(); - const int y = id.y(); - const int z = id.z(); - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ in_x, in_y, z, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + const int z = id.z(); + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{in_x, in_y, z, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { @@ -188,26 +209,28 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.y(); - const int y = id.z(); - - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ 0, in_x, in_y, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0)); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{0, in_x, in_y, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), + element_size * _input->info()->dimension(0)); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h index 5eceee0904..d98ac621b0 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include 
"src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{}); + void configure(const ITensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ITensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -90,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{}); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp index 677c5cddcc..a59bbd233b 100644 --- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2, } } // namespace -NEBitwiseAndKernel::NEBitwiseAndKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_and(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_and(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp index 19b1af690a..ecd181a7af 100644 --- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include 
"arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri } } // namespace -NEBitwiseNotKernel::NEBitwiseNotKernel() - : _input(nullptr), _output(nullptr) +NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr) { } @@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output) // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info) Iterator input(_input, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_not_U8_U8(input.ptr(), output.ptr()); - }, - input, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output); } diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp index 08094fbfcf..4c906134aa 100644 --- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseOrKernel::NEBitwiseOrKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -99,9 +99,7 @@ void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { 
bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp index fc5b38b64f..dbbed2483c 100644 --- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseXorKernel::NEBitwiseXorKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), - AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp index 69bfd56ce0..cb869838e2 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/boundingboxtransform/list.h" @@ -45,7 +46,11 @@ struct BoundingBoxTransformSelectorData }; using BoundingBoxTransformSelctorPtr = std::add_pointer::type; -using BoundingBoxTransformUKernelPtr = std::add_pointer::type; +using BoundingBoxTransformUKernelPtr = std::add_pointer::type; struct BoundingBoxTransformKernel { @@ -54,26 +59,19 @@ struct BoundingBoxTransformKernel BoundingBoxTransformUKernelPtr ukernel; }; -static const BoundingBoxTransformKernel available_kernels[] = -{ - { - "fp32_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; }, - 
REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform) - }, +static const BoundingBoxTransformKernel available_kernels[] = { + {"fp32_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform) - }, + {"fp16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform) - }, + {"qu16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -85,9 +83,9 @@ static const BoundingBoxTransformKernel available_kernels[] = */ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -95,7 +93,10 @@ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformS return nullptr; } -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes); @@ -108,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0); - if(boxes->data_type() == DataType::QASYMM16) + if (boxes->data_type() == DataType::QASYMM16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8); const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform(); @@ -120,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(pred_boxes->data_type() == DataType::QASYMM16) + if (pred_boxes->data_type() == DataType::QASYMM16) { const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f); @@ -142,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel() { } -void 
NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); // Configure kernel window - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); // Set instance variables _boxes = boxes; @@ -164,7 +171,10 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred INEKernel::configure(win); } -Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; @@ -176,7 +186,7 @@ void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &i ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() }); + const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window); diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h index def827836c..3915994feb 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h @@ -63,7 +63,8 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
* */ - void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); + void + configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -77,7 +78,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp index 64da1f2262..3b53b7055f 100644 --- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp +++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -72,20 +77,22 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.x(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimX, channel_id * num_groups + group_id); - std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords)); - }, - in); + 
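A hedged usage sketch of the NEBoundingBoxTransformKernel interface reformatted above (illustrative only; `boxes`, `pred_boxes` and `deltas` are hypothetical tensors, and the BoundingBoxTransformInfo arguments (image width, height and scale) are placeholder values):

    NEBoundingBoxTransformKernel bbt_kernel;
    // boxes holds the input proposals and deltas the per-box transform deltas;
    // pred_boxes is auto-initialised to deltas' shape with boxes' data type.
    bbt_kernel.configure(&boxes, &pred_boxes, &deltas,
                         BoundingBoxTransformInfo(800.f, 600.f, 1.f));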
execute_window_loop( + window, + [&](const Coordinates &id) + { + // Shuffle channel + const unsigned int curr_channel = id.x(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimX, channel_id * num_groups + group_id); + std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords)); + }, + in); } void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window) { @@ -107,34 +114,35 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.z(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimZ, channel_id * num_groups + group_id); - const uint8_t *input_ptr = in.ptr(); - uint8_t *output_ptr = output->ptr_to_element(out_coords); - - // Copy plane - for(unsigned int y = 0; y < height; ++y) + execute_window_loop( + win, + [&](const Coordinates &id) { - std::copy_n(input_ptr, row_size, output_ptr); - input_ptr += input_stride_y; - output_ptr += output_stride_y; - } - }, - in); + // Shuffle channel + const unsigned int curr_channel = id.z(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimZ, channel_id * num_groups + group_id); + const uint8_t *input_ptr = in.ptr(); + uint8_t *output_ptr = output->ptr_to_element(out_coords); + + // Copy plane + for (unsigned int y = 0; y < height; ++y) + { + std::copy_n(input_ptr, row_size, output_ptr); + input_ptr += input_stride_y; + output_ptr += output_stride_y; + } + }, + in); } } // namespace -NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr), _num_groups() +NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups() { } @@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu INEKernel::configure(win); } -Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); return Status{}; @@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_input->info()->data_layout()) + switch (_input->info()->data_layout()) { case DataLayout::NHWC: channel_shuffle_nhwc(_input, _output, _num_groups, window); diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h index 1976302036..bc6652fd30 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.h +++ b/src/core/NEON/kernels/NECol2ImKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NECOL2IMKERNEL_H #define ARM_COMPUTE_NECOL2IMKERNEL_H -#include 
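For the channel shuffle kernel reformatted above, a small sketch, assuming configure() takes its arguments in the same (input, output, num_groups) order as the validate() overload shown in this hunk; `src` and `dst` are hypothetical tensors:

    NEChannelShuffleLayerKernel cs_kernel;
    // validate_arguments() requires 2 <= num_groups < channels and
    // channels % num_groups == 0.
    cs_kernel.configure(&src, &dst, /* num_groups */ 4);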
"src/core/NEON/INEKernel.h" - #include "arm_compute/core/Size2D.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { class ITensor; diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp index 94c455305c..60271fbc74 100644 --- a/src/core/NEON/kernels/NECropKernel.cpp +++ b/src/core/NEON/kernels/NECropKernel.cpp @@ -26,14 +26,15 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/helpers/bit_ops.h" #include "src/cpu/kernels/crop/list.h" @@ -47,7 +48,8 @@ struct CropSelectorData }; using CropSelectorPtr = std::add_pointer::type; -using CropUKernelPtr = std::add_pointer::type; +using CropUKernelPtr = std::add_pointer::type; struct CropUKernel { @@ -56,48 +58,23 @@ struct CropUKernel CropUKernelPtr ukernel; }; -static const CropUKernel available_kernels[] = -{ - { - "fp16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window) - }, - { - "f32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window) - }, - { - "u8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window) - }, - { - "u16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window) - }, - { - "u32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window) - }, - { - "s8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window) - }, - { - "s16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window) - }, - { - "s32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window) - }, +static const CropUKernel available_kernels[] = { + {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)}, + {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)}, + {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)}, + {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)}, + {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; }, + 
REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)}, + {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)}, + {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)}, + {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)}, }; /** Micro-kernel selector @@ -108,9 +85,9 @@ static const CropUKernel available_kernels[] = */ const CropUKernel *get_implementation(const CropSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,26 +96,40 @@ const CropUKernel *get_implementation(const CropSelectorData &data) return nullptr; } -inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit) +inline void out_of_bounds_crop_window(const ITensor *output, + float *output_ptr, + float extrapolation_value, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit) { - auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); - float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x) + auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); + int32_t x = 0; + int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); + float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); + for (; x <= limit - window_step_x; x += window_step_x) { wrapper::vstore(output_start_ptr + x, in); } - for(; x < limit; ++x) + for (; x < limit; ++x) { *(output_start_ptr + x) = extrapolation_value; } } -inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value, - const std::array &rows_out_of_bounds, const std::array &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function, - bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped) +inline void execute_window(const ITensor *input, + const ITensor *output, + Coordinates input_offset, + float extrapolation_value, + const std::array &rows_out_of_bounds, + const std::array &cols_out_of_bounds, + NECropKernel::InBoundsCropFunction *in_bounds_crop_function, + bool is_height_flipped, + bool has_cols_in_bounds, + bool has_cols_out_of_bounds_before, + bool has_cols_out_of_bounds_after, + bool input_has_single_channel, + bool is_width_flipped) { // Output is always float. const int window_step_x = 16 / sizeof(float); @@ -159,45 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina // |------------------------------| // Fill all output rows that have no elements that are within the input bounds with the extrapolation value. // First for the rows before the in bounds rows. 
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[0] * output->info()->dimension(1)); output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0); // Iterate through each row that has any elements within the input bounds. - for(uint32_t row = rows_out_of_bounds[0]; static_cast(row) < static_cast(output->info()->dimension(2) - rows_out_of_bounds[1]); - ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) + for (uint32_t row = rows_out_of_bounds[0]; + static_cast(row) < static_cast(output->info()->dimension(2) - rows_out_of_bounds[1]); + ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) { // Fill all elements in the row that are out of bounds with the extrapolation value. // First for the elements before the in bounds elements. - if(has_cols_out_of_bounds_before) + if (has_cols_out_of_bounds_before) { out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]); } // Copy all elements within the input bounds from the input tensor. - if(has_cols_in_bounds) + if (has_cols_in_bounds) { (*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0], - output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped); + output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, + is_width_flipped); } // Fill all elements after the in bounds elements with the extrapolation value. - if(has_cols_out_of_bounds_after) + if (has_cols_out_of_bounds_after) { - out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, + output->info()->dimension(1) - cols_out_of_bounds[1], + output->info()->dimension(1)); } output_ptr += output->info()->dimension(1) * output->info()->dimension(0); } // Fill all rows after the in bounds elements with the extrapolation value. 
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[1] * output->info()->dimension(1)); } } // namespace NECropKernel::NECropKernel() - : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds() + : _input(nullptr), + _crop_boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _start(), + _end(), + _crop_box_ind(0), + _extrapolation_value(0), + _rows_out_of_bounds(), + _cols_out_of_bounds() { } -void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value) +void NECropKernel::configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), + crop_box_ind, extrapolation_value)); _input = input; _crop_boxes = crop_boxes; @@ -207,21 +219,27 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co _extrapolation_value = extrapolation_value; } -Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value) +Status NECropKernel::validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_UNUSED(extrapolation_value); - const auto *uk = get_implementation(CropSelectorData{ input->data_type() }); + const auto *uk = get_implementation(CropSelectorData{input->data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, + DataType::F16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind); ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -242,48 +260,53 @@ void NECropKernel::configure_output_shape() // The normalized coordiantes are scaled to retrieve the floating point image coordinates which are rounded to integers. 
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1); + _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, + abs(_end[1] - _start[1]) + 1); _output->info()->set_tensor_shape(out_shape); bool is_width_flipped = _end[0] < _start[0]; bool is_height_flipped = _end[1] < _start[1]; - if(is_height_flipped) + if (is_height_flipped) { - _rows_out_of_bounds[0] = _start[1] >= static_cast(_input->info()->dimension(2)) ? std::min(static_cast(_start[1] - _input->info()->dimension(2) + 1), - static_cast(_output->info()->dimension(2))) : - 0; + _rows_out_of_bounds[0] = _start[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(static_cast(_start[1] - _input->info()->dimension(2) + 1), + static_cast(_output->info()->dimension(2))) + : 0; _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast(-_end[1]), - static_cast(_output->info()->dimension(2))) : - 0; + static_cast(_output->info()->dimension(2))) + : 0; } else { _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast(-_start[1]), - static_cast(_output->info()->dimension(2))) : - 0; - _rows_out_of_bounds[1] = _end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(static_cast(_end[1] - _input->info()->dimension(2) + 1), - static_cast(_output->info()->dimension(2))) : - 0; + static_cast(_output->info()->dimension(2))) + : 0; + _rows_out_of_bounds[1] = _end[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(static_cast(_end[1] - _input->info()->dimension(2) + 1), + static_cast(_output->info()->dimension(2))) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - _cols_out_of_bounds[0] = _start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(static_cast(_start[0] - _input->info()->dimension(1) + 1), - static_cast(_output->info()->dimension(1))) : - 0; + _cols_out_of_bounds[0] = _start[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(static_cast(_start[0] - _input->info()->dimension(1) + 1), + static_cast(_output->info()->dimension(1))) + : 0; _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast(-_end[0]), - static_cast(_output->info()->dimension(1))) : - 0; + static_cast(_output->info()->dimension(1))) + : 0; } else { _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast(-_start[0]), - static_cast(_output->info()->dimension(1))) : - 0; - _cols_out_of_bounds[1] = _end[0] >= static_cast(_input->info()->dimension(1)) ? std::min(static_cast(_end[0] - _input->info()->dimension(1) + 1), - static_cast(_output->info()->dimension(1))) : - 0; + static_cast(_output->info()->dimension(1))) + : 0; + _cols_out_of_bounds[1] = _end[0] >= static_cast(_input->info()->dimension(1)) + ? 
std::min(static_cast(_end[0] - _input->info()->dimension(1) + 1), + static_cast(_output->info()->dimension(1))) + : 0; } INEKernel::configure(calculate_max_window(*_output->info())); @@ -298,13 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_input->info()->has_padding()); ARM_COMPUTE_ERROR_ON(_output->info()->has_padding()); - const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()}); uint32_t batch_index = *(reinterpret_cast(_box_ind->ptr_to_element(Coordinates(_crop_box_ind)))); - Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], - _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); - execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1], - _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0, + Coordinates input_offset( + 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], + _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); + execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, + uk->ukernel, + _end[1]<_start[1], + _cols_out_of_bounds[0] + + _cols_out_of_bounds[1]<_output->info()->dimension(1), _cols_out_of_bounds[0]> 0, + _cols_out_of_bounds[1]> 0, _start[0] <= _end[0], _end[0] < _start[0]); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h index 6c989c1d2c..da4a1b26e5 100644 --- a/src/core/NEON/kernels/NECropKernel.h +++ b/src/core/NEON/kernels/NECropKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEON_CROP_KERNEL_H #include "arm_compute/core/Types.h" -#include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -67,7 +67,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. */ - void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + void configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -82,7 +87,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Configure output tensor's shape as this can only be determined at runtime. 
*/ void configure_output_shape(); @@ -91,7 +101,8 @@ public: void run(const Window &window, const ThreadInfo &info) override; /** Function to use for in bounds crop for the particular tensor types passed to configure() */ - using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); + using InBoundsCropFunction = + void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index 6dcc85ec2e..de0079ee60 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -52,12 +53,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -74,7 +77,8 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); @@ -117,26 +121,27 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { Window slice_in = window.first_slice_window_2D(); do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.x(); - const 
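To make the NECropKernel flow reformatted above easier to follow, a usage sketch (not part of the patch; `src`, `crop_boxes`, `box_ind` and `dst` are hypothetical tensors, with `crop_boxes` holding normalised box coordinates of shape [4, num_boxes] and `box_ind` one batch index per box):

    NECropKernel crop_kernel;
    crop_kernel.configure(&src, &crop_boxes, &box_ind, &dst,
                          /* crop_box_ind */ 0, /* extrapolation_value */ 0.f);
    // The output shape depends on the selected crop box, so it can only be
    // resolved at run time, once the box data is available:
    crop_kernel.configure_output_shape();
    crop_kernel.run(crop_kernel.window(), ThreadInfo{});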
int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{ out_x, out_y, z, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_2D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); } else { @@ -144,20 +149,21 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{ z, out_x, out_y, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_3D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp index 261437f07d..a5969cd497 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,16 +38,19 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -56,7 +60,10 
@@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -68,12 +75,14 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) +NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) { } -void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config) +void NEFFTDigitReverseKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); @@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - if(axis == 0) + if (axis == 0) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0; } @@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0; } } - else if(axis == 1) + else if (axis == 1) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1; } @@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c } } -Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window) std::vector buffer_row_out(2 * N); std::vector buffer_row_in(2 * N); - execute_window_loop(slice, [&](const Coordinates &) - { - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &) { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), 2 * N * sizeof(float)); - - // Shuffle - for(size_t x = 0; x < 2 * N; x += 2) + if (is_input_complex) { - size_t idx = buffer_idx[x / 2]; - buffer_row_out[x] = buffer_row_in[2 * idx]; - buffer_row_out[x + 1] = (is_conj ? 
-buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); - } - } - else - { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), N * sizeof(float)); + // Load + memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), 2 * N * sizeof(float)); - // Shuffle - for(size_t x = 0; x < N; ++x) + // Shuffle + for (size_t x = 0; x < 2 * N; x += 2) + { + size_t idx = buffer_idx[x / 2]; + buffer_row_out[x] = buffer_row_in[2 * idx]; + buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); + } + } + else { - size_t idx = buffer_idx[x]; - buffer_row_out[2 * x] = buffer_row_in[idx]; + // Load + memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), N * sizeof(float)); + + // Shuffle + for (size_t x = 0; x < N; ++x) + { + size_t idx = buffer_idx[x]; + buffer_row_out[2 * x] = buffer_row_in[idx]; + } } - } - // Copy back - memcpy(reinterpret_cast(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); - }, - in, out); + // Copy back + memcpy(reinterpret_cast(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); + }, + in, out); } template @@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window) const size_t stride_z = _input->info()->strides_in_bytes()[2]; const size_t stride_w = _input->info()->strides_in_bytes()[3]; - execute_window_loop(slice, [&](const Coordinates & id) - { - auto *out_ptr = reinterpret_cast(out.ptr()); - auto *in_ptr = reinterpret_cast(_input->buffer() + id.z() * stride_z + id[3] * stride_w); - const size_t y_shuffled = buffer_idx[id.y()]; - - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &id) { - // Shuffle the entire row into the output - memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + auto *out_ptr = reinterpret_cast(out.ptr()); + auto *in_ptr = reinterpret_cast(_input->buffer() + id.z() * stride_z + id[3] * stride_w); + const size_t y_shuffled = buffer_idx[id.y()]; - // Conjugate if necessary - if(is_conj) + if (is_input_complex) { - for(size_t x = 0; x < 2 * Nx; x += 2) + // Shuffle the entire row into the output + memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + + // Conjugate if necessary + if (is_conj) { - out_ptr[x + 1] = -out_ptr[x + 1]; + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x + 1] = -out_ptr[x + 1]; + } } } - } - else - { - // Shuffle the entire row into the buffer - memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); - - // Copy the buffer to the output, with a zero imaginary part - for(size_t x = 0; x < 2 * Nx; x += 2) + else { - out_ptr[x] = buffer_row[x / 2]; + // Shuffle the entire row into the buffer + memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); + + // Copy the buffer to the output, with a zero imaginary part + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x] = buffer_row[x / 2]; + } } - } - }, - out); + }, + out); } void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h index f436c364b2..ecf85ebc98 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -70,7 +71,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo 
*input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp index 44c841f626..4b58a7b9ac 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp @@ -28,10 +28,11 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/ToolchainSupport.h" #include @@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b) { using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - const float32x2_t mask = { -1.0, 1.0 }; + const float32x2_t mask = {-1.0, 1.0}; const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); @@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant) const float a_r = wrapper::vgetlane(a, 0); const float a_i = wrapper::vgetlane(a, 1); - const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant }); + const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant}); return out; } @@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_ return wrapper::vadd(t2, e); } -float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) +float32x2_t reduce_sum_7( + float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32 return wrapper::vadd(t00, t01); } -float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8) +float32x2_t reduce_sum_8(float32x2_t x1, + float32x2_t x2, + float32x2_t x3, + float32x2_t x4, + float32x2_t x5, + float32x2_t x6, + float32x2_t x7, + float32x2_t x8) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, x = wrapper::vadd(a, b); x = wrapper::vadd(x, c); - const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c)); - const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c)); + const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c)); + const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c)); y = z = wrapper::vsub(a, v1); y = wrapper::vadd(y, v2); z = wrapper::vsub(z, v2); } -void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t 
&w3) +void fft_4(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3) { float32x2_t a = x1; float32x2_t b = c_mul_neon(w, x2); @@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c x4 = wrapper::vadd(x41, x42); } -void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4) +void fft_5(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, + const float32x2_t &w4) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto d = c_mul_neon(w3, x4); const auto e = c_mul_neon(w4, x5); - const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b); - const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b); + const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b); + const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b); - const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c); + const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c); + const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c); + const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c); + const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c); - const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d); + const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d); + const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d); + const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d); - const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e); - const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e); + const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e); + const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e); + const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e); x1 = reduce_sum_5(a, b, c, d, e); x2 = reduce_sum_5(a, b0, c0, d0, e0); @@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x5 = reduce_sum_5(a, b3, c3, d3, e3); } -void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, +void fft_7(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + const float32x2_t &w, + const 
float32x2_t &w2, + const float32x2_t &w3, const float32x2_t &w4, - const float32x2_t &w5, const float32x2_t &w6) + const float32x2_t &w5, + const float32x2_t &w6) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto f = c_mul_neon(w5, x6); const auto g = c_mul_neon(w6, x7); - const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b); - const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c); - const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d); - const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e); - const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e); - const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e); - const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f); - const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f); - const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f); - const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f); - const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g); - const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g); - const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g); - const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g); - const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g); + const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b); + const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b); + const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b); + const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b); + + const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c); + const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c); + const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c); + const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c); + const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c); + const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d); + const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d); + const auto d2 = 
c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d); + const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d); + const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d); + + const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e); + const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e); + const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e); + const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e); + const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f); + const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f); + const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f); + const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f); + const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f); + const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f); + + const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g); + const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g); + const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g); + const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g); + const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g); + const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g); x1 = reduce_sum_7(a, b, c, d, e, f, g); x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0); @@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5); } -void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2, +void fft_8(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + float32x2_t &x8, + const float32x2_t &w, + const float32x2_t &w2, const float32x2_t &w3, - const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6, + const float32x2_t &w4, + const float32x2_t &w5, + const float32x2_t &w6, const float32x2_t &w7) { const auto a = x1; @@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto g = c_mul_neon(w6, x7); const auto h = c_mul_neon(w7, x8); - const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b); - const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b); - const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c); - const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d); - const auto d5 = 
c_mul_neon(float32x2_t{ 0, -1 }, d); - const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f); - const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f); - const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f); - const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f); - const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f); - const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g); - const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g); - const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g); - - const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h); - const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h); - const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h); - const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h); - const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h); - const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h); - const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h); + const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b); + const auto b1 = c_mul_neon(float32x2_t{0, -1}, b); + const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b); + const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b); + const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b); + const auto b5 = c_mul_neon(float32x2_t{0, 1}, b); + const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b); + + const auto c0 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c2 = c_mul_neon(float32x2_t{0, 1}, c); + const auto c3 = c_mul_neon(float32x2_t{1, 0}, c); + const auto c4 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c6 = c_mul_neon(float32x2_t{0, 1}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d); + const auto d1 = c_mul_neon(float32x2_t{0, 1}, d); + const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d); + const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d); + const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d); + const auto d5 = c_mul_neon(float32x2_t{0, -1}, d); + const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d); + + const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e1 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e3 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e5 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f); + const auto f1 = c_mul_neon(float32x2_t{0, -1}, f); + const 
auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f); + const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f); + const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f); + const auto f5 = c_mul_neon(float32x2_t{0, 1}, f); + const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f); + + const auto g0 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g2 = c_mul_neon(float32x2_t{0, -1}, g); + const auto g3 = c_mul_neon(float32x2_t{1, 0}, g); + const auto g4 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g6 = c_mul_neon(float32x2_t{0, -1}, g); + + const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h); + const auto h1 = c_mul_neon(float32x2_t{0, 1}, h); + const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h); + const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h); + const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h); + const auto h5 = c_mul_neon(float32x2_t{0, -1}, h); + const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h); x1 = reduce_sum_8(a, b, c, d, e, f, g, h); x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0); @@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f } template -void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_2_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - auto a = float32x2_t{ 0, 0 }; - auto b = float32x2_t{ 0, 0 }; + auto a = float32x2_t{0, 0}; + auto b = float32x2_t{0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_2(a, b, w); // Write outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_2_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_3_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 
1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_3(a, b, c, w, w2); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_3_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_4_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_4(a, b, c, d, w, w2, w3); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_4_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - 
float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_5_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_5(a, b, c, d, e, w, w2, w3, w4); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_5_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_7_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const 
float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_7_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_8_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; - float32x2_t h = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; + float32x2_t h = {0, 0}; // Base-case prime transform - 
if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_8_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_UNUSED(config); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_UNUSED(config); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo // FFT table axis 0: [radix, first_stage] static std::map> fft_table_axis0; - if(fft_table_axis0.empty()) + if (fft_table_axis0.empty()) { fft_table_axis0[2][false] = &fft_radix_2_axes_0; fft_table_axis0[3][false] = &fft_radix_3_axes_0; @@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo // FFT table axis 1: [radix, first_stage] static std::map fft_table_axis1; - if(fft_table_axis1.empty()) + if (fft_table_axis1.empty()) { fft_table_axis1[2] = &fft_radix_2_axes_1; fft_table_axis1[3] = &fft_radix_3_axes_1; @@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT ARM_COMPUTE_ERROR_ON_NULLPTR(input); // Output auto inizialitation if not yet initialized - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()->clone()); } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? 
output->info() : nullptr, config)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); _input = input; _output = (output == nullptr) ? input : output; @@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT _axis = config.axis; _radix = config.radix; - switch(config.axis) + switch (config.axis) { case 0: set_radix_stage_axis0(config); @@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); + auto win_config = + validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config) + .first); return Status{}; } std::set NEFFTRadixStageKernel::supported_radix() { - return std::set { 2, 3, 4, 5, 7, 8 }; + return std::set{2, 3, 4, 5, 7, 8}; } void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) @@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) // Precompute FFT constants const unsigned int NxRadix = _radix * _Nx; const float alpha = 2.0f * kPi / float(NxRadix); - const float32x2_t w_m{ cosf(alpha), -sinf(alpha) }; + const float32x2_t w_m{cosf(alpha), -sinf(alpha)}; - if(_axis == 0) + if (_axis == 0) { const unsigned int N = _input->info()->dimension(0); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_0(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) { + _func_0(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, + N); + }, + in, out); } else { const unsigned int N = _input->info()->dimension(0); const unsigned int M = _input->info()->dimension(1); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_1(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N, M, - _input->info()->padding().right + _input->info()->padding().left, - _output->info()->padding().right + _output->info()->padding().left); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) + { + _func_1(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N, + M, _input->info()->padding().right + _input->info()->padding().left, + _output->info()->padding().right + _output->info()->padding().left); + }, + in, out); } ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h index 
2291a1068c..54f32efa23 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" #include @@ -92,8 +93,17 @@ private: void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config); void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config); - using FFTFunctionPointerAxis0 = std::function; - using FFTFunctionPointerAxis1 = std::function; + using FFTFunctionPointerAxis0 = + std::function; + using FFTFunctionPointerAxis1 = std::function; FFTFunctionPointerAxis0 _func_0; FFTFunctionPointerAxis1 _func_1; diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp index 5ec330bebc..9fe561fc59 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp +++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -41,8 +42,8 @@ namespace void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale) { const auto a = wrapper::vload(c_in); - auto b = wrapper::vdiv(a, float32x2_t{ scale, scale }); - if(is_conjugate) + auto b = wrapper::vdiv(a, float32x2_t{scale, scale}); + if (is_conjugate) { const float img_part = wrapper::vgetlane(b, 1); b = wrapper::vsetlane(-img_part, b, 1); @@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -71,7 +72,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // Configure kernel window Window win = calculate_max_window(*input, Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, input_window); Iterator out(_run_in_place ? 
_input : _output, input_window); - execute_window_loop(window, [&](const Coordinates &) - { - scale_complex(reinterpret_cast(in.ptr()), reinterpret_cast(out.ptr()), _is_conj, _scale); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &) + { scale_complex(reinterpret_cast(in.ptr()), reinterpret_cast(out.ptr()), _is_conj, _scale); }, + in, out); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h index 24a19f98ba..608cf5ea34 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.h +++ b/src/core/NEON/kernels/NEFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H #define ARM_COMPUTE_NEFFTSCALEKERNEL_H -#include "src/core/NEON/INEKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { // Forward declarations diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 1c7c1f9763..00b0c0ae8d 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -30,14 +30,19 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { namespace { -inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value) +inline void fill_constant_value_single_channel_special(ITensor *tensor, + const Window &window, + unsigned int right, + unsigned int bottom, + const PixelValue &constant_border_value) { float border_value; constant_border_value.get(border_value); @@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi Iterator vertical_it(tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - const auto row_start = reinterpret_cast(start_valid_region + vertical_it.offset()); + execute_window_loop( + vertical, + [&](const Coordinates &) + { + const auto row_start = reinterpret_cast(start_valid_region + vertical_it.offset()); - // Fill left and right borders - *(row_start - 1) = border_value; - std::fill_n(row_start + width, right, border_value); - }, - vertical_it); + // Fill left and right borders + *(row_start - 1) = border_value; + std::fill_n(row_start + width, right, border_value); + }, + vertical_it); // Top and bottom border Iterator plane_it(tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - const auto row_start = reinterpret_cast(base_addr - stridey); - // Fill top rows including left/right borders - std::fill_n(row_start - 1, 1 + width + right, border_value); - - // Bottom border - const unsigned low_border_size = height + bottom; - for(unsigned int i = height; i < low_border_size; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - const auto row_start = reinterpret_cast(base_addr + i * stridey); - - // Fill bottom rows including left/right borders + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + const auto row_start = reinterpret_cast(base_addr - stridey); + // Fill top rows including left/right borders std::fill_n(row_start - 1, 
1 + width + right, border_value); - } - }, - plane_it); + + // Bottom border + const unsigned low_border_size = height + bottom; + for (unsigned int i = height; i < low_border_size; ++i) + { + const auto row_start = reinterpret_cast(base_addr + i * stridey); + + // Fill bottom rows including left/right borders + std::fill_n(row_start - 1, 1 + width + right, border_value); + } + }, + plane_it); } } // namespace @@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel() { } -void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); _tensor = tensor; configure(tensor->info(), border_size, border_mode, constant_border_value); } -void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. @@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); // If there is no border: early exit - if(_border_size.empty()) + if (_border_size.empty()) { return; } @@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_mode) + switch (_mode) { case BorderMode::CONSTANT: { - if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) + if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) { - fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value); + fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, + _constant_border_value); } else { @@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window) Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, vertical_it.ptr(), element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, vertical_it.ptr(), + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, + element_size); + } + }, + 
vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Copy top rows including left/right borders - std::memcpy(base_addr + i * static_cast(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size, - base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) + { + // Copy top rows including left/right borders + std::memcpy(base_addr + i * static_cast(_tensor->info()->strides_in_bytes()[1]) - + _border_size.left * element_size, + base_addr - _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } - // Bottom border - for(unsigned int i = height; i < height + _border_size.bottom; ++i) - { - // Copy bottom rows including left/right borders - std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, - base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } - }, - plane_it); + // Bottom border + for (unsigned int i = height; i < height + _border_size.bottom; ++i) + { + // Copy bottom rows including left/right borders + std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, + base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - + _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } + }, + plane_it); } void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window) @@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, &_constant_border_value, element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, &_constant_border_value, + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); + } + }, + vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Fill top 
rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) { - std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill top rows including left/right borders + for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + { + std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - // Bottom border - const unsigned low_border_size = height + _border_size.bottom; - for(unsigned int i = height; i < low_border_size; ++i) - { - // Fill bottom rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + // Bottom border + const unsigned low_border_size = height + _border_size.bottom; + for (unsigned int i = height; i < low_border_size; ++i) { - std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill bottom rows including left/right borders + for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + { + std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - }, - plane_it); + }, + plane_it); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h index 2c851583ed..aaad108bfa 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.h +++ b/src/core/NEON/kernels/NEFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -64,7 +65,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the function. * * @note This kernel fills the borders within the XY-planes. @@ -75,7 +79,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp index 51a69046a9..cbe5136fb1 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -30,12 +29,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include @@ -52,8 +53,16 @@ struct FuseBatchNormalizeSelectorData }; using FBNSelectorPtr = std::add_pointer::type; -using FBNUKernelPtr = std::add_pointer::type; +using FBNUKernelPtr = std::add_pointer::type; struct FBNUKernel { @@ -62,73 +71,63 @@ struct FBNUKernel FBNUKernelPtr ukernel; }; -static const FBNUKernel available_kernels[] = -{ - { - "fused_batch_normalization_conv_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_conv_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_dwc_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16) - }, - { - "fused_batch_normalization_dwc_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16) - }, - { - "fused_batch_normalization_conv_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_conv_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_dwc_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32) - }, - { - "fused_batch_normalization_dwc_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && 
data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32) - } -}; +static const FBNUKernel available_kernels[] = { + {"fused_batch_normalization_conv_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_conv_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_dwc_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)}, + {"fused_batch_normalization_dwc_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)}, + {"fused_batch_normalization_conv_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_conv_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_dwc_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)}, + {"fused_batch_normalization_dwc_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}}; /** Micro-kernel selector * @@ -140,9 +139,9 @@ static const FBNUKernel available_kernels[] = */ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -150,10 +149,16 @@ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, 
FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -164,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -212,15 +218,31 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr) + : _input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + _bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false), + _func(nullptr) { } -void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor 
*input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -238,27 +260,27 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con _run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? 
bn_gamma->info() : nullptr, epsilon, + fbn_type)); - const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ + input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); _func = uk->ukernel; @@ -268,12 +290,19 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con INEKernel::configure(win); } -Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -284,6 +313,7 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window); + (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, + window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h index ee767b01c8..f23280d55a 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h @@ -66,9 +66,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
*/ - void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias = nullptr, + const ITensor *bn_beta = nullptr, + const ITensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -86,10 +93,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -107,8 +120,16 @@ private: bool _run_in_place_weights; bool _run_in_place_bias; - using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); + using FuseBatchNormFunction = void(const ITensor *input_weights, + const ITensor *input_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window); FuseBatchNormFunction *_func; }; diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp index 11332ffac8..f1d457d399 100644 --- a/src/core/NEON/kernels/NEGatherKernel.cpp +++ b/src/core/NEON/kernels/NEGatherKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,20 +43,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(axis < 0) + if 
(axis < 0) { axis += input->num_dimensions(); } ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast(input->num_dimensions())); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > + Coordinates::num_max_dimensions); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -81,23 +84,23 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) const auto idx_info = _indices->info(); const auto dst_info = _output->info(); - const auto num_dims = dst_info->num_dimensions(); + const auto num_dims = dst_info->num_dimensions(); const auto chunk_stride = src_info->strides_in_bytes()[_axis]; const auto window_start_x = window.x().start(); - const auto window_end_x = window.x().end(); - auto window_size_x = src_info->element_size(); + const auto window_end_x = window.x().end(); + auto window_size_x = src_info->element_size(); const auto idx_limit = static_cast(src_info->tensor_shape()[_axis]); - if(_axis != 0) + if (_axis != 0) { dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1)); window_size_x *= window_end_x - window_start_x; } // Compute source and index tensors window based on the output window. - auto src_win = dst_win; + auto src_win = dst_win; Window idx_win; for (size_t i = 0; i < idx_info->num_dimensions(); ++i) @@ -109,22 +112,27 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) // Use the custom strides to access all three tensors using the same loop. 
Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win); Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win); - Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win); - - execute_window_loop(dst_win, [&](const Coordinates &) { - const auto idx = *reinterpret_cast(idx_it.ptr()); - - if(idx >= 0 && idx < idx_limit) - { - const auto src_ptr = src_it.ptr() + idx * chunk_stride; + Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), + dst_info->offset_first_element_in_bytes(), dst_win); - std::copy_n(src_ptr, window_size_x, dst_it.ptr()); - } - else + execute_window_loop( + dst_win, + [&](const Coordinates &) { - std::fill_n(dst_it.ptr(), window_size_x, 0); - } - }, src_it, idx_it, dst_it); + const auto idx = *reinterpret_cast(idx_it.ptr()); + + if (idx >= 0 && idx < idx_limit) + { + const auto src_ptr = src_it.ptr() + idx * chunk_stride; + + std::copy_n(src_ptr, window_size_x, dst_it.ptr()); + } + else + { + std::fill_n(dst_it.ptr(), window_size_x, 0); + } + }, + src_it, idx_it, dst_it); } void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) @@ -137,13 +145,13 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe _output = output; _axis = axis; - if(_axis < 0) + if (_axis < 0) { _axis += input->info()->num_dimensions(); } ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast(input->info()->num_dimensions())); - switch(_indices->info()->data_type()) + switch (_indices->info()->data_type()) { case DataType::U32: _func = &NEGatherKernel::gather_common; @@ -157,7 +165,8 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe } // Output auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Create window @@ -169,30 +178,31 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe // These will be used to iterate lock-step through all tensors (input, indices and output). 
size_t dim_no = 0; - const auto input_info = input->info(); + const auto input_info = input->info(); const auto &input_strides = input_info->strides_in_bytes(); - const auto indices_info = indices->info(); - const auto &indices_strides = indices_info->strides_in_bytes(); - const auto indices_num_dims = indices_info->num_dimensions(); + const auto indices_info = indices->info(); + const auto &indices_strides = indices_info->strides_in_bytes(); + const auto indices_num_dims = indices_info->num_dimensions(); - for(; dim_no < static_cast(_axis); ++dim_no) + for (; dim_no < static_cast(_axis); ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no]; } - for(; dim_no < static_cast(_axis) + indices_num_dims; ++dim_no) + for (; dim_no < static_cast(_axis) + indices_num_dims; ++dim_no) { _idx_it_strides[dim_no] = indices_strides[dim_no - _axis]; } - for(; dim_no < Coordinates::num_max_dimensions; ++dim_no) + for (; dim_no < Coordinates::num_max_dimensions; ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1]; } } -Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); return Status{}; diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h index ce69daeda7..b8c069f99e 100644 --- a/src/core/NEON/kernels/NEGatherKernel.h +++ b/src/core/NEON/kernels/NEGatherKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_NEGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -92,8 +93,8 @@ private: ITensor *_output; kernel_ptr _func; - Strides _src_it_strides; - Strides _idx_it_strides; + Strides _src_it_strides; + Strides _idx_it_strides; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEGATHERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp index 7bba136e84..549319e49f 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp @@ -27,11 +27,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/genproposals/list.h" + #include namespace arm_compute @@ -44,7 +46,8 @@ struct ComputeAllAnchorsData }; using ComputeAllAnchorsSelectorPtr = std::add_pointer::type; -using ComputeAllAnchorsUKernelPtr = std::add_pointer::type; +using ComputeAllAnchorsUKernelPtr = std::add_pointer::type; struct ComputeAllAnchorsKernel { @@ -53,27 +56,17 @@ struct ComputeAllAnchorsKernel ComputeAllAnchorsUKernelPtr ukernel; }; -static const ComputeAllAnchorsKernel available_kernels[] = -{ +static const ComputeAllAnchorsKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_qu16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors) - }, + {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; }, 
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors) - }, + {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp32_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors) - }, + {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)}, }; /** Micro-kernel selector @@ -84,9 +77,9 @@ static const ComputeAllAnchorsKernel available_kernels[] = */ const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -101,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { const size_t feature_height = info.feat_height(); const size_t feature_width = info.feat_width(); @@ -111,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -139,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -151,7 +145,9 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a INEKernel::configure(win); } -Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; @@ -163,7 +159,7 @@ void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = 
get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() }); + const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_anchors, _all_anchors, _anchors_info, window); diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h index 297d6d4abe..30699eee01 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h @@ -78,5 +78,5 @@ private: ITensor *_all_anchors; ComputeAnchorsInfo _anchors_info; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index 71641404bf..0a1780f6ee 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -31,12 +31,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/instancenorm/list.h" #include @@ -51,7 +52,13 @@ struct InstanceNormSelectorData }; using InstanceNormSelctorPtr = std::add_pointer::type; -using InstanceNormUKernelPtr = std::add_pointer::type; +using InstanceNormUKernelPtr = std::add_pointer::type; struct InstanceNormKernel { @@ -60,19 +67,12 @@ struct InstanceNormKernel InstanceNormUKernelPtr ukernel; }; -static const InstanceNormKernel available_kernels[] = -{ - { - "fp32_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm) - }, +static const InstanceNormKernel available_kernels[] = { + {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm) - }, + {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC }; @@ -84,9 +84,9 @@ static const InstanceNormKernel available_kernels[] = */ const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -102,14 +102,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout 
is not supported by the kernel directly"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; } @@ -132,7 +134,9 @@ NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel() { } -void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info) +void NEInstanceNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -152,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp INEKernel::configure(std::get<1>(win_config)); } -Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); return Status{}; } @@ -165,7 +172,7 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window); diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h index f166ce2058..024ccd9ef2 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h @@ -68,7 +68,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,14 +83,15 @@ private: * @param[in] beta The offset scalar value applied to the normalized tensor. 
Defaults to 0.0 * @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12 */ - using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); + using NormalizationFunction = + void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); ITensor *_input; ITensor *_output; float _gamma; float _beta; float _epsilon; - bool _use_mixed_precision{ true }; + bool _use_mixed_precision{true}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp index 8ab0288ab1..eea57a17d3 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -30,11 +30,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/NEON/NEMath.h" #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" #include "src/cpu/kernels/l2normlayer/list.h" #include @@ -55,7 +56,8 @@ struct L2NormalizeLayerSelectorData using L2NormalizeLayerKernelSelctorPtr = std::add_pointer::type; -using L2NormalizeLayerPtr = std::add_pointer::type; +using L2NormalizeLayerPtr = std::add_pointer::type; struct L2NormalizeLayerKernel { @@ -64,26 +66,25 @@ struct L2NormalizeLayerKernel L2NormalizeLayerPtr ukernel; }; -static const L2NormalizeLayerKernel available_kernels[] = -{ - { - "fp32_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x) - }, - { - "fp32_neon_l2normalize_yz", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz) - }, +static const L2NormalizeLayerKernel available_kernels[] = { + {"fp32_neon_l2normalize_x", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)}, + {"fp32_neon_l2normalize_yz", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)}, { "fp16_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x), }, { "fp16_neon_l2normalize_yz", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz), }, }; @@ -96,9 +97,9 @@ static const L2NormalizeLayerKernel available_kernels[] = */ const L2NormalizeLayerKernel *get_implementation(const 
L2NormalizeLayerSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -106,7 +107,8 @@ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorD return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -115,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -151,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel() { } -void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon) +void NEL2NormalizeLayerKernel::configure( + const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); @@ -169,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su INEKernel::configure(std::get<1>(win_config)); } -Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status NEL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } @@ -183,12 +189,13 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_actual_axis > 2) + if (_actual_axis > 2) { ARM_COMPUTE_ERROR("Unsupported normalization axis"); } - const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation( + L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()}); 
ARM_COMPUTE_ERROR_ON(uk == nullptr); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h index af3ad3403e..3524e66a21 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h @@ -74,7 +74,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp index 6939e08ef0..6be6284528 100644 --- a/src/core/NEON/kernels/NELogicalKernel.cpp +++ b/src/core/NEON/kernels/NELogicalKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,7 +51,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -58,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); src0 += half_step; @@ -66,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) && (*src1); ++src0; @@ -84,21 +85,21 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8 const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) && broadcast_val_clamped_s; ++src; @@ -112,7 +113,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -120,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); src0 += half_step; @@ -128,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const 
uint8_t *src1, uint8_t *dst, uin dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) || (*src1); ++src0; @@ -146,21 +147,21 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_ const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) || broadcast_val_clamped_s; ++src; @@ -173,21 +174,21 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = !(*src); ++src; @@ -197,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) void run_unary(const Window &window, const ITensor *src, ITensor *dst) { - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const auto len = window.x().end() - window.x().start(); Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - neon_logical_not(in.ptr(), out.ptr(), len); - }, - in, out); + execute_window_loop( + win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out); } void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op) @@ -216,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); const auto len = window.x().end() - window.x().start(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { - using LogicalBroadcastUKernelPtr = std::add_pointer::type; - LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; + using LogicalBroadcastUKernelPtr = std::add_pointer::type; + LogicalBroadcastUKernelPtr logical_func = + op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; const bool is_broadcast_input_1 = src1_win.x().step() == 0; Window broadcast_win = is_broadcast_input_1 ? 
src1_win : src0_win; @@ -238,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const uint8_t broadcast_value = *broadcast_in.ptr(); - logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); - - }, - broadcast_in, non_broadcast_in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const uint8_t broadcast_value = *broadcast_in.ptr(); + logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); + }, + broadcast_in, non_broadcast_in, out); } else { - using LogicalUKernelPtr = std::add_pointer::type; + using LogicalUKernelPtr = std::add_pointer::type; LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and; src0_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -257,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator in0(src0, src0_win); Iterator in1(src1, src1_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); - }, - in0, in1, out); + execute_window_loop( + win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out); } } } // namespace @@ -270,7 +267,10 @@ const char *NELogicalKernel::name() const return "NELogicalKernel"; } -void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op) +void NELogicalKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output); ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op)); @@ -279,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in Window win = calculate_max_window(*input1, Steps()); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); @@ -292,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in set_data_type_if_unknown(*output, input1->data_type()); } -Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op) +Status NELogicalKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -306,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0)); 
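
For illustration only, outside the patch itself: the broadcast checks above rely on the usual rule that two dimensions are compatible when they are equal or one of them is 1, and that the broadcast dimension is their maximum; an empty result mirrors the total_size() == 0 failure path. The helper name and the simplified shape type below are hypothetical stand-ins for TensorShape::broadcast_shape, not the library's implementation.

// Minimal sketch of shape broadcasting, assuming dimension 0 is the innermost (x)
// axis and missing higher dimensions behave as size 1.
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

std::vector<std::size_t> broadcast_shape_sketch(std::vector<std::size_t> a, std::vector<std::size_t> b)
{
    if (a.size() < b.size())
    {
        std::swap(a, b);
    }
    b.resize(a.size(), 1); // absent higher dimensions act as size 1
    std::vector<std::size_t> out(a.size());
    for (std::size_t i = 0; i < a.size(); ++i)
    {
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
        {
            return {}; // not broadcast compatible; an empty shape signals the error
        }
        out[i] = std::max(a[i], b[i]);
    }
    return out;
}
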
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -326,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - if(_op == LogicalOperation::Not) + if (_op == LogicalOperation::Not) { run_unary(window, src0, dst); } diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h index caf69cf45d..477a59d826 100644 --- a/src/core/NEON/kernels/NELogicalKernel.h +++ b/src/core/NEON/kernels/NELogicalKernel.h @@ -58,10 +58,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); + static Status + validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index 37e88a8565..451031d696 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -28,12 +28,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/meanstddevnorm/list.h" namespace arm_compute @@ -46,7 +47,8 @@ struct MeanStdDevNormSelectorData }; using MeanStdDevNormSelctorPtr = std::add_pointer::type; -using MeanStdDevNormUKernelPtr = std::add_pointer::type; +using MeanStdDevNormUKernelPtr = + std::add_pointer::type; struct MeanStdDevNormKernel { @@ -55,25 +57,15 @@ struct MeanStdDevNormKernel MeanStdDevNormUKernelPtr ukernel; }; -static const std::vector available_kernels = -{ - { - "fp32_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm) - }, +static const std::vector available_kernels = { + {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm) - }, + {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "qasymm8_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm) - }, + 
{"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)}, }; /** Micro-kernel selector @@ -84,9 +76,9 @@ static const std::vector available_kernels = */ const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -103,7 +95,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -113,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) { - if(output != nullptr) + if (output != nullptr) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized @@ -128,8 +120,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() - : _input(nullptr), _output(nullptr), _epsilon(1e-8f) +NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f) { } @@ -137,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); _input = input; _output = (output == nullptr) ? input : output; @@ -152,7 +144,9 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (output != nullptr) ? 
output->clone().get() : nullptr) + .first); return Status{}; } @@ -162,7 +156,7 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _epsilon, window); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 49a045382d..2c61bda147 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -29,19 +29,23 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/NormalizationHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); @@ -52,7 +56,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -69,7 +73,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel() { } -void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info) +void NENormalizationLayerKernel::configure(const ITensor *input, + const ITensor *input_squared, + ITensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); // Output tensor auto initialization if not yet initialized @@ -85,15 +92,15 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _output = output; _norm_info = norm_info; - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -104,7 +111,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ 
-124,11 +131,11 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -139,7 +146,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -196,8 +203,9 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const auto beta_vec = wrapper::vdup_n(static_cast(_norm_info.beta()), ExactTagType{}); const auto kappa_vec = wrapper::vdup_n(static_cast(_norm_info.kappa()), ExactTagType{}); - auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr, - T * output_ptr) + auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, + const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, + T *output_ptr) { const int current_slice = dim == 0 ? x : id[dim]; const int first_slice = std::max(current_slice - radius, 0); @@ -206,75 +214,87 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; // Accumulate 2D In-Map values auto accu = static_cast(0.f); - for(int j = first_row; j <= last_row; ++j) + for (int j = first_row; j <= last_row; ++j) { // Compute row displacement const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + for (int i = first_slice; i <= last_slice; ++i) { - accu += *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); + accu += + *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); } } // Normalize - const auto normalized = std::pow(accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); + const auto normalized = std::pow( + accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); const auto normalized_pixel = (*(input_ptr + x)) / normalized; *(output_ptr + x) = normalized_pixel; }; - execute_window_loop(win, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; + const int last_row = do_2D_norm ? 
std::min(current_row + radius, max_bottom) : 0; - int x = window_start_x; - // Compute serially starting elements for the case x dimension is width - for(; x < radius && x < window_end_x && dim == 0; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } + int x = window_start_x; + // Compute serially starting elements for the case x dimension is width + for (; x < radius && x < window_end_x && dim == 0; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } - // Compute vectorized - for(; x <= window_end_x - window_step_x - radius; x += window_step_x) - { - const int current_slice = dim == 0 ? x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for(int j = first_row; j <= last_row; ++j) + // Compute vectorized + for (; x <= window_end_x - window_step_x - radius; x += window_step_x) { - // Compute row displacement - const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + const int current_slice = dim == 0 ? x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + for (int j = first_row; j <= last_row; ++j) { - accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + // Compute row displacement + const uint8_t *const input_squared_ptr = + input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu = wrapper::vadd( + accu, wrapper::vloadq(reinterpret_cast( + input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + } } - } - // Normalize - const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); - const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); - wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); - } + // Normalize + const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); + const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); + wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } - }, - input, input_squared, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + }, + input, input_squared, output); } -Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info) +Status 
NENormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h index 53a06b9ed9..2d8d9f3d60 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h @@ -60,7 +60,8 @@ public: * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); + void + configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -72,7 +73,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + NormalizationLayerInfo norm_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp index 734510b637..c9bcbc9127 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp @@ -28,26 +28,31 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &paddings, + const PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); - const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); + const TensorInfo expected_output_info = 
input->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c template void NEPadLayerKernel::run_pad_constant(const Window &window) { - Window output_window{ window }; + Window output_window{window}; output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); const size_t element_size = _input->info()->element_size(); Iterator output_it(_output, output_window); - execute_window_loop(output_window, [&](const Coordinates & id) - { - Coordinates idin{ id }; - for(size_t dim = _padding.size() - 1; dim > 0; --dim) + execute_window_loop( + output_window, + [&](const Coordinates &id) { - idin[dim] -= _padding[dim].first; - if(idin[dim] < 0 || static_cast(_input->info()->dimension(dim)) - 1 < idin[dim]) + Coordinates idin{id}; + for (size_t dim = _padding.size() - 1; dim > 0; --dim) { - std::fill_n(reinterpret_cast(output_it.ptr()), _output->info()->dimension(0), _constant_value.get()); - return; + idin[dim] -= _padding[dim].first; + if (idin[dim] < 0 || static_cast(_input->info()->dimension(dim)) - 1 < idin[dim]) + { + std::fill_n(reinterpret_cast(output_it.ptr()), _output->info()->dimension(0), + _constant_value.get()); + return; + } } - } - T *input_it_ptr = reinterpret_cast(_input->ptr_to_element(idin)); - T *output_it_ptr = reinterpret_cast(output_it.ptr()); - std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get()); - memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); - std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get()); - }, - output_it); + T *input_it_ptr = reinterpret_cast(_input->ptr_to_element(idin)); + T *output_it_ptr = reinterpret_cast(output_it.ptr()); + std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get()); + memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); + std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, + _constant_value.get()); + }, + output_it); } void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window) @@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t end_plane = window.z().end(); size_t start_plane_input = start_plane; - if(_padding.size() > 2) + if (_padding.size() > 2) { start_plane_input = (start_plane < _padding[2].first) ? 
0 : start_plane - _padding[2].first; } @@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t jump_to_next_row_input = _input->info()->dimension(0); const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second; - uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; - const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; - const auto pad_value = _constant_value.get(); + uint8_t *output_row_ptr = + _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; + const uint8_t *input_it_ptr = + _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; + const auto pad_value = _constant_value.get(); - for(size_t z_i = start_plane; z_i < end_plane; ++z_i) + for (size_t z_i = start_plane; z_i < end_plane; ++z_i) { - if(_padding.size() > 2 && z_i < _padding[2].first) + if (_padding.size() > 2 && z_i < _padding[2].first) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; } - else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) + else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; @@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window output_row_ptr += pad_y_elems_top; size_t y_i = _input->info()->dimension(1); // Basic loop unrolling - for(; y_i > 3; y_i -= 4) + for (; y_i > 3; y_i -= 4) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window memset(output_row_ptr, pad_value, _padding[0].second); output_row_ptr += _padding[0].second; } - for(; y_i > 0; --y_i) + for (; y_i > 0; --y_i) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel() { } -void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayerKernel::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Auto-init - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); - const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); + const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); auto_init_if_empty(*output->info(), expected_output_info); // Perform validation step @@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL _constant_value = constant_value; _mode = mode; - if(_mode == PaddingMode::CONSTANT) + if (_mode == PaddingMode::CONSTANT) { - switch(_input->info()->element_size()) + switch (_input->info()->element_size()) 
{ case 1: - if(_input->info()->num_dimensions() == 3 && // Is 3D - padding.size() <= 3 && // Has 3D padding - !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding + if (_input->info()->num_dimensions() == 3 && // Is 3D + padding.size() <= 3 && // Has 3D padding + !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding { _func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad; } @@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL ICPPKernel::configure(win); } -Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode)); @@ -253,7 +273,7 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_func != nullptr) + if (_func != nullptr) { (this->*_func)(window); } @@ -263,7 +283,7 @@ size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) c { ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - + return ICPPKernel::default_mws; } diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h index f82af1558a..d432887d2c 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.h +++ b/src/core/NEON/kernels/NEPadLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEPADLAYERKERNEL_H #include "arm_compute/core/PixelValue.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -62,7 +63,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT. * Only CONSTANT padding mode is currently supported */ - void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + void configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer. * * @param[in] input Source tensor info. Data types supported: All. 
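
For illustration only, outside the patch itself: the constant-pad path above derives its output shape with compute_padded_shape(), where each dimension simply grows by the padding applied before and after it. The sketch below uses simplified stand-in types for PaddingList and the tensor shape rather than the library's own classes.

// Minimal sketch of padded-shape computation, assuming a PaddingList of
// {pad_before, pad_after} pairs indexed by dimension.
#include <cstddef>
#include <utility>
#include <vector>

using PaddingInfo = std::pair<std::size_t, std::size_t>;
using PaddingList = std::vector<PaddingInfo>;

std::vector<std::size_t> compute_padded_shape_sketch(std::vector<std::size_t> shape, const PaddingList &padding)
{
    for (std::size_t dim = 0; dim < padding.size() && dim < shape.size(); ++dim)
    {
        // Dimension dim grows by the elements padded before and after it.
        shape[dim] += padding[dim].first + padding[dim].second;
    }
    return shape;
}
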
@@ -75,7 +80,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp index 3d89933377..15e933e66e 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +37,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0"); } @@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions should match"); } - for(unsigned int i = 0; i < info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, } } // namespace -NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr), _info() +NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info() { } -void 
NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, - const int height) +void NEPriorBoxLayerKernel::store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height) { float xmin = (center_x - box_width / 2.f) / width; float ymin = (center_y - box_height / 2.f) / height; float xmax = (center_x + box_width / 2.f) / width; float ymax = (center_y + box_height / 2.f) / height; - float32x4_t vec_elements = { xmin, ymin, xmax, ymax }; - if(_info.clip()) + float32x4_t vec_elements = {xmin, ymin, xmax, ymax}; + if (_info.clip()) { static const float32x4_t CONST_0 = vdupq_n_f32(0.f); static const float32x4_t CONST_1 = vdupq_n_f32(1.f); @@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) int img_width = _info.img_size().x; int img_height = _info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = _input2->info()->dimension(width_idx); img_height = _input2->info()->dimension(height_idx); @@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) float step_x = _info.steps()[0]; float step_y = _info.steps()[1]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast(img_width) / layer_width; step_y = static_cast(img_height) / layer_height; @@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2)); Iterator output(_output, slice); - execute_window_loop(slice, [&](const Coordinates & id) - { - float center_x = 0; - float center_y = 0; - int idx = id.x() / (4 * num_priors); - center_x = (static_cast(idx % layer_width) + _info.offset()) * step_x; - center_y = (static_cast(idx / layer_width) + _info.offset()) * step_y; - - float box_width; - float box_height; - int offset = 0; - - auto out = reinterpret_cast(output.ptr()); - for(unsigned int i = 0; i < _info.min_sizes().size(); ++i) + execute_window_loop( + slice, + [&](const Coordinates &id) { - const float min_size = _info.min_sizes().at(i); - box_width = min_size; - box_height = min_size; - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; - - if(!_info.max_sizes().empty()) + float center_x = 0; + float center_y = 0; + int idx = id.x() / (4 * num_priors); + center_x = (static_cast(idx % layer_width) + _info.offset()) * step_x; + center_y = (static_cast(idx / layer_width) + _info.offset()) * step_y; + + float box_width; + float box_height; + int offset = 0; + + auto out = reinterpret_cast(output.ptr()); + for (unsigned int i = 0; i < _info.min_sizes().size(); ++i) { - const float max_size = _info.max_sizes().at(i); - box_width = std::sqrt(min_size * max_size); - box_height = box_width; - + const float min_size = _info.min_sizes().at(i); + box_width = min_size; + box_height = min_size; store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); offset += 4; - } - // rest of priors - for(auto ar : _info.aspect_ratios()) - { - if(fabs(ar - 1.) 
< 1e-6) + if (!_info.max_sizes().empty()) { - continue; + const float max_size = _info.max_sizes().at(i); + box_width = std::sqrt(min_size * max_size); + box_height = box_width; + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); + // rest of priors + for (auto ar : _info.aspect_ratios()) + { + if (fabs(ar - 1.) < 1e-6) + { + continue; + } - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; + } } - } - // set the variance - out = reinterpret_cast(_output->ptr_to_element(Coordinates(id.x(), 1))); - float32x4_t var; - if(_info.variances().size() == 1) - { - var = vdupq_n_f32(_info.variances().at(0)); - } - else - { - const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) }; - var = vars; - } - for(int i = 0; i < num_priors; ++i) - { - vst1q_f32(out + 4 * i, var); - } - }, - output); + // set the variance + out = reinterpret_cast(_output->ptr_to_element(Coordinates(id.x(), 1))); + float32x4_t var; + if (_info.variances().size() == 1) + { + var = vdupq_n_f32(_info.variances().at(0)); + } + else + { + const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2), + _info.variances().at(3)}; + var = vars; + } + for (int i = 0; i < num_priors; ++i) + { + vst1q_f32(out + 4 * i, var); + } + }, + output); } -void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayerKernel::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu INEKernel::configure(win); } -Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); @@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info) // Run function calculate_prior_boxes(window); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h index 430a47f9f8..460f80e085 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h @@ -67,7 +67,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -84,7 +87,14 @@ 
private: * @param[in] width Input width. * @param[in] height Input height. */ - void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height); + void store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height); /** Function to calculate prior boxes. * * @param[in] window Input region on which to execute the kernel. diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp index 46a0f625ce..8e1ed3a2a5 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp @@ -26,17 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" #include @@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 const int64_t b_3 = vgetlane(b_high, 1); int64x2x2_t result; - const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 }; - const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 }; + const int64x2_t result_0{a_0 * b_0, a_1 * b_1}; + const int64x2_t result_1{a_2 * b_2, a_3 * b_3}; result.val[0] = vadd(vmovl(vgetlow(bias)), result_0); result.val[1] = vadd(vmovl(vgethigh(bias)), result_1); @@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 } } // namespace -void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias) +void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *weight, + const ITensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); ARM_COMPUTE_ERROR_ON(input == output); ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info())); - static const std::map fn_map = - { - { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) }, + static const std::map fn_map = { + {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)}, }; _input = input; @@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o _output->info()->set_quantization_info(compute_output_qinfo()); const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform(); - const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); + const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); _output_shift *= -1; - if(!bool(s)) + if (!bool(s)) { _output_multiplier = 0; _output_shift = 0; @@ -134,7 
+136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target) return window; } -Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_UNUSED(output, bias, weight, input); @@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -182,11 +187,11 @@ inline std::pair NEQLSTMLayerNormalizationKernel::sum_qsymm16( using AccType = int64_t; using InputDataType = int16_t; - AccType sum{ 0 }; - AccType sum_sq{ 0 }; + AccType sum{0}; + AccType sum_sq{0}; int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { using namespace wrapper; const int16x8_t val = vloadq(input_ptr + x); @@ -216,7 +221,7 @@ inline std::pair NEQLSTMLayerNormalizationKernel::sum_qsymm16( #endif // __aarch64__ } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { const InputDataType val = input_ptr[x]; sum += static_cast(val); @@ -230,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift) + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift) { using OutputDataType = int16_t; @@ -238,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{}); int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { const int16x8_t val = vloadq(input_ptr + x); int32x4x2_t shifted; @@ -267,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i vstore(output_ptr + x + 4, vqmovn(out_val.val[1])); } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { - const auto val = static_cast(input_ptr[x]); - const int32_t shifted = (val << 10) - mean; - const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); - const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; + const auto val = static_cast(input_ptr[x]); + const int32_t shifted = (val << 10) - mean; + const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); + const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; const auto reverse_shifted = static_cast((weighted + 512) >> 10); - int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); - out_val = utility::clamp(out_val, std::numeric_limits::min()); - output_ptr[x] = static_cast(out_val); + int32_t out_val = + 
quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); + out_val = + utility::clamp(out_val, std::numeric_limits::min()); + output_ptr[x] = static_cast(out_val); } } @@ -287,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16() using BiasDataType = int32_t; using AccType = int64_t; - Iterator input_iterator{ _input, _inout_window }; - Iterator output_iterator{ _output, _inout_window }; - Iterator weight_iterator{ _weight, _weight_window }; - Iterator bias_iterator{ _bias, _weight_window }; + Iterator input_iterator{_input, _inout_window}; + Iterator output_iterator{_output, _inout_window}; + Iterator weight_iterator{_weight, _weight_window}; + Iterator bias_iterator{_bias, _weight_window}; const auto weight_ptr = reinterpret_cast(weight_iterator.ptr()); const auto bias_ptr = reinterpret_cast(bias_iterator.ptr()); const uint32_t column_size = _input->info()->tensor_shape()[0]; - execute_window_loop(_inout_window, [ &, this](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_iterator.ptr()); - auto out_ptr = reinterpret_cast(output_iterator.ptr()); - - AccType sum{ 0 }; - AccType sum_sq{ 0 }; - std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); - - AccType mean{ 0 }; - AccType variance{ 0 }; - std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); - - int32_t stddev_invsqrt_mul{}; - int32_t stddev_invsqrt_shift{}; - quantization::get_invsqrt_quantized_multiplier_exp(static_cast(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift); - - normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); - }, - input_iterator, output_iterator); + execute_window_loop( + _inout_window, + [&, this](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(input_iterator.ptr()); + auto out_ptr = reinterpret_cast(output_iterator.ptr()); + + AccType sum{0}; + AccType sum_sq{0}; + std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); + + AccType mean{0}; + AccType variance{0}; + std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); + + int32_t stddev_invsqrt_mul{}; + int32_t stddev_invsqrt_shift{}; + quantization::get_invsqrt_quantized_multiplier_exp(static_cast(variance), -1, stddev_invsqrt_mul, + stddev_invsqrt_shift); + + normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); + }, + input_iterator, output_iterator); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h index a3ff6e988f..af5b6a0315 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H #include "src/core/NEON/INEKernel.h" + #include namespace arm_compute @@ -69,34 +70,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: // constants - static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */ - static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported 
*/ - static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */ - static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */ + static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */ + static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */ + static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */ + static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */ using ComputeFuncType = std::function; ComputeFuncType _fn{}; /**< Function pointer to computation function */ - const ITensor *_input - { - nullptr - }; /**< Input tensor */ - const ITensor *_weight - { - nullptr - }; /**< Weight tensor */ - const ITensor *_bias - { - nullptr - }; /**< Bias tensor */ - ITensor *_output{ nullptr }; /**< Output tensor */ + const ITensor *_input{nullptr}; /**< Input tensor */ + const ITensor *_weight{nullptr}; /**< Weight tensor */ + const ITensor *_bias{nullptr}; /**< Bias tensor */ + ITensor *_output{nullptr}; /**< Output tensor */ int32_t _output_multiplier{}; /**< Multiplier for output values */ int32_t _output_shift{}; /**< Shift value for output values */ @@ -138,7 +131,9 @@ private: int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift); + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift); /** Function to compute output quantization information */ QuantizationInfo compute_output_qinfo(); }; diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp index 802aebb526..486cd6d331 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/Utility.h" -#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/roialign/list.h" @@ -49,7 +50,12 @@ struct ROIAlignSelectorData }; using ROIAlignSelctorPtr = std::add_pointer::type; -using ROIAlignUKernelPtr = std::add_pointer::type; +using ROIAlignUKernelPtr = std::add_pointer::type; struct ROIAlignKernel { @@ -58,31 +64,18 @@ struct ROIAlignKernel ROIAlignUKernelPtr ukernel; }; -static const ROIAlignKernel available_kernels[] = -{ - { - "fp32_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign) - }, +static const ROIAlignKernel available_kernels[] = { + {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign) - }, + {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)}, #endif 
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign) - }, - { - "qs8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign) - }, + {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)}, + {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -94,9 +87,9 @@ static const ROIAlignKernel available_kernels[] = */ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,24 +97,29 @@ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) + if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -143,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel() { } -void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayerKernel::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto inizialitation if not yet initialized const TensorShape 
output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info); - auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); output->info()->set_data_layout(input->info()->data_layout()); // Configure kernel window @@ -167,7 +169,10 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, INEKernel::configure(window); } -Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; @@ -176,9 +181,9 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info) { const DataLayout data_layout = _input->info()->data_layout(); - if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) { - const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _rois, _pool_info, window, info); diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h index 48a3de7285..9cc538b429 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h @@ -83,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp index 400e8291d6..1a3810fb56 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. 
*/ #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" + #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +38,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois); @@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con * @param[in] roi_indx Index of image of coordinate in output Tensor to store value */ template -void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y, - int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx) +void template_eval(const ITensor *input, + const ITensor *output, + int region_start_x, + int region_start_y, + int region_end_x, + int region_end_y, + int fm, + int px, + int py, + int roi_batch, + int roi_indx) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0; } else { T curr_max = std::numeric_limits::lowest(); // Min value of typename T - for(int j = region_start_y; j < region_end_y; ++j) + for (int j = region_start_y; j < region_end_y; ++j) { - for(int i = region_start_x; i < region_end_x; ++i) + for (int i = region_start_x; i < region_end_x; ++i) { const auto val = *reinterpret_cast(input->ptr_to_element(Coordinates(i, j, fm, roi_batch))); curr_max = std::max(val, curr_max); @@ -93,11 +108,13 @@ void template_eval(const ITensor *input, const ITensor *output, int region_start } // if quantized datatype, requantize then store in output tensor - if(is_data_type_quantized(input->info()->data_type())) + if (is_data_type_quantized(input->info()->data_type())) { // covert qasymm to new output quantization scale and offset - UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); - *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo); + UniformQuantizationInfo uqinfo = compute_requantization_scale_offset( + 
input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); + *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = + quantize_qasymm8(curr_max, uqinfo); } else { @@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel() { } -Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; } -void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayerKernel::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); @@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || + (output->info()->dimension(1) != pool_info.pooled_height())); // Set instance variables _input = input; @@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const auto *rois_ptr = reinterpret_cast(_rois->buffer()); const auto data_type = _input->info()->data_type(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) { const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; const auto x1 = rois_ptr[values_per_roi * roi_indx + 1]; @@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f); // Iterate through all feature maps - for(int fm = 0; fm < fms; ++fm) + for (int fm = 0; fm < fms; ++fm) { // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) + for (int py = 0; py < pooled_h; ++py) { - for(int px = 0; px < pooled_w; ++px) + for (int px = 0; px < pooled_w; ++px) { auto region_start_x = static_cast(std::floor((static_cast(px) / pooled_w) * roi_width)); - auto region_end_x = static_cast(std::floor((static_cast(px + 1) / pooled_w) * roi_width)); - auto region_start_y = static_cast(std::floor((static_cast(py) / pooled_h) * roi_height)); - auto region_end_y = 
static_cast(std::floor((static_cast(py + 1) / pooled_h) * roi_height)); + auto region_end_x = + static_cast(std::floor((static_cast(px + 1) / pooled_w) * roi_width)); + auto region_start_y = + static_cast(std::floor((static_cast(py) / pooled_h) * roi_height)); + auto region_end_y = + static_cast(std::floor((static_cast(py + 1) / pooled_h) * roi_height)); region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width); region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width); region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height); region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height); - switch(data_type) + switch (data_type) { case DataType::F32: - template_eval(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; case DataType::QASYMM8: - template_eval(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; default: ARM_COMPUTE_ERROR("DataType not Supported"); @@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) } } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h index e7a7e90eef..81f6006ea2 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h @@ -63,7 +63,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor. 
*/ - void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp index ec63a35de9..87b7b76b72 100644 --- a/src/core/NEON/kernels/NERangeKernel.cpp +++ b/src/core/NEON/kernels/NERangeKernel.cpp @@ -29,11 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/range/list.h" namespace arm_compute @@ -55,48 +56,23 @@ struct RangeUKernel RangeUKernelPtr ukernel; }; -static const RangeUKernel available_kernels[] = -{ - { - "fp16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function) - }, - { - "f32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function) - }, - { - "u8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function) - }, - { - "u16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function) - }, - { - "u32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function) - }, - { - "s8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function) - }, - { - "s16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function) - }, - { - "s32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function) - }, +static const RangeUKernel available_kernels[] = { + {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)}, + {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)}, + {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)}, + {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; }, + 
REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)}, + {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)}, + {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)}, + {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)}, + {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)}, }; /** Micro-kernel selector @@ -107,9 +83,9 @@ static const RangeUKernel available_kernels[] = */ const RangeUKernel *get_implementation(const RangeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,28 +95,31 @@ const RangeUKernel *get_implementation(const RangeSelectorData &data) Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step) { - const auto *uk = get_implementation(RangeSelectorData{ output.data_type() }); + const auto *uk = get_implementation(RangeSelectorData{output.data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), + "start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -NERangeKernel::NERangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { } @@ -151,7 +130,8 @@ void NERangeKernel::configure(ITensor 
*output, float start, float end, float ste ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step)); // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, + output->info()->data_type(), output->info()->quantization_info()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -178,7 +158,7 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()}); uk->ukernel(_output, _start, _step, window); } diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h index 90560995e6..fa555c2c2e 100644 --- a/src/core/NEON/kernels/NERangeKernel.h +++ b/src/core/NEON/kernels/NERangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NERANGEKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 19955af493..455d604b3b 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -28,16 +28,17 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/SaturateCast.h" -#include "src/core/NEON/wrapper/wrapper.h" #include namespace arm_compute @@ -48,7 +49,7 @@ namespace template void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) { - if(std::is_same::value) + if (std::is_same::value) { auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); wrapper::vstore(output.ptr() + offset, res); @@ -63,8 +64,8 @@ void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset template uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4_t mask{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4_t mask{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask = wrapper::vcgt(b, a); } @@ -73,12 +74,12 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp mask = wrapper::vclt(b, a); } - uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 }; - if(axis != 0) + uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3}; + if (axis != 0) { vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } }; + uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, 
c.val[0]), 0, 0, 0}}; return res; } @@ -86,9 +87,9 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp template uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x4_t mask{ { 0 } }; - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x4_t mask{{0}}; + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u8 = wrapper::vcgt(b, a); } @@ -96,44 +97,43 @@ uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, R { mask_u8 = wrapper::vclt(b, a); } - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); - - uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 }, - { idx + 8, idx + 9, idx + 10, idx + 11 }, - { idx + 12, idx + 13, idx + 14, idx + 15 } - } - }; - if(axis != 0) + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + mask.val[0] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + mask.val[1] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + mask.val[2] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + mask.val[3] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + + uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, + {idx + 4, idx + 5, idx + 6, idx + 7}, + {idx + 8, idx + 9, idx + 10, idx + 11}, + {idx + 12, idx + 13, idx + 14, idx + 15}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = - { - { - vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), - vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), - vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), - vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3]) - } - }; + uint32x4x4_t res = { + {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}}; return res; } // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. 
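// --- Editorial sketch (not part of the original patch): the calculate_min/calculate_max
// helpers below reduce a 128-bit register with pairwise folds expressed through the
// wrapper:: layer. A self-contained equivalent with raw NEON intrinsics, for the
// float32x4_t case, could look as follows; the function name and lane comments are
// illustrative only, not library API.
#include <arm_neon.h>

static inline float horizontal_min_f32(float32x4_t in)
{
    // Fold {in0, in1, in2, in3} pairwise: first across the two halves, then within the pair.
    float32x2_t pmin = vpmin_f32(vget_high_f32(in), vget_low_f32(in)); // {min(in2, in3), min(in0, in1)}
    pmin             = vpmin_f32(pmin, pmin);                          // both lanes hold min(in0..in3)
    return vget_lane_f32(pmin, 0);
}
// --- end editorial sketch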
template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, float32x2_t, int32x2_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, float32x2_t, int32x2_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmin(pmin, pmin); @@ -141,9 +141,10 @@ inline typename std::enable_if < std::is_same::value || std::is_ // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, uint8x8_t, int8x8_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, uint8x8_t, int8x8_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmin = wrapper::vpmin(pmin, pmin); @@ -153,9 +154,10 @@ inline typename std::enable_if < std::is_same::value || std::is_s // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, float32x2_t, int32x2_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, float32x2_t, int32x2_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmax(pmax, pmax); @@ -163,9 +165,10 @@ inline typename std::enable_if < std::is_same::value || std::is_ // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. 
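// --- Editorial sketch (not part of the original patch): for 8-bit data the same pairwise
// fold needs extra rounds, since a uint8x16_t carries 16 lanes. A self-contained sketch
// with raw NEON intrinsics; the function name and per-step comments are illustrative only.
#include <arm_neon.h>

static inline uint8_t horizontal_max_u8(uint8x16_t in)
{
    uint8x8_t pmax = vpmax_u8(vget_high_u8(in), vget_low_u8(in)); // 16 candidates -> 8
    pmax           = vpmax_u8(pmax, pmax);                        // 8 -> 4
    pmax           = vpmax_u8(pmax, pmax);                        // 4 -> 2
    pmax           = vpmax_u8(pmax, pmax);                        // 2 -> 1
    return vget_lane_u8(pmax, 0);                                 // lane 0 now holds the overall max
}
// --- end editorial sketch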
template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, uint8x8_t, int8x8_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, uint8x8_t, int8x8_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmax = wrapper::vpmax(pmax, pmax); @@ -176,10 +179,10 @@ inline typename std::enable_if < std::is_same::value || std::is_s template uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4_t res_idx_mask{ 0 }; + uint32x4_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -203,10 +206,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, Reduc template uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4x4_t res_idx_mask{ { 0 } }; + uint32x4x4_t res_idx_mask{{0}}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -218,12 +221,18 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va } // Widen vectors - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); @@ -241,19 +250,19 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va pmin = 
wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 4); + } while (iter < 4); return (res - 0xFFFFFFFF); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> -uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +uint32x4x4_t +calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x2_t mask{ 0 }; - uint16x8_t mask_u16{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x2_t mask{0}; + uint16x8_t mask_u16{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u16 = wrapper::vcgt(b, a); } @@ -263,19 +272,14 @@ uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x } mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); - uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 } - } - }; - if(axis != 0) + uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), - wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), - 0, 0 - }; + uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0}; return res; } @@ -298,10 +302,10 @@ inline float16x4_t calculate_max(float16x8_t in) template <> uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) { - uint32x4x2_t res_idx_mask{ 0 }; + uint32x4x2_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); uint16x8_t mask_u16; - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -313,8 +317,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va } // Widen vectors - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); @@ -328,8 +334,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va pmin = wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 2); + } while (iter < 2); return (res - 0xFFFFFFFF); } @@ -388,7 +393,8 @@ struct RedOpX /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { const size_t input_dim_0 = in->info()->dimension(0); const int window_step_x = 16 / sizeof(T); @@ -402,211 +408,217 @@ struct RedOpX Iterator output(out, out_window); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - auto init_res_value = static_cast(0.f); - switch(op) + in_win_no_pad, + [&](const Coordinates &) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - init_res_value = static_cast(*input_ptr); - break; - } - case ReductionOperation::PROD: - { - init_res_value = static_cast(1.f); - break; - } - default: - break; - } - auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast(input.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) + auto init_res_value = static_cast(0.f); + switch (op) { - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } + case ReductionOperation::ARG_IDX_MIN: case ReductionOperation::MIN: + case ReductionOperation::MAX: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + init_res_value = static_cast(*input_ptr); break; } - case ReductionOperation::MAX: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + init_res_value = static_cast(1.f); break; } default: - ARM_COMPUTE_ERROR("Not supported"); + break; } - } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{{0}}; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM_SQUARE: + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { -#ifdef ARM_COMPUTE_DEBUG_ENABLED - auto res = static_cast(0.f); - for(int i = 0; i < S; ++i) + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) { - res += wrapper::vgetlane(vec_res_value, i); + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), 
vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } -#else // ARM_COMPUTE_DEBUG_ENABLED - auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - for(int i = 0; i < S / 4; ++i) + } + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM_SQUARE: { - carry_res = wrapper::vpadd(carry_res, carry_res); - } - auto res = wrapper::vgetlane(carry_res, 0); +#ifdef ARM_COMPUTE_DEBUG_ENABLED + auto res = static_cast(0.f); + for (int i = 0; i < S; ++i) + { + res += wrapper::vgetlane(vec_res_value, i); + } +#else // ARM_COMPUTE_DEBUG_ENABLED + auto carry_res = + wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for (int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); #endif // ARM_COMPUTE_DEBUG_ENABLED - if(op == ReductionOperation::SUM_SQUARE) - { - // Compute left-over elements - for(; x < window_end_x; ++x) + if (op == ReductionOperation::SUM_SQUARE) { - res += (*(input_ptr + x)) * (*(input_ptr + x)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += (*(input_ptr + x)) * (*(input_ptr + x)); + } + } + else + { + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + res /= input_dim_0; } + + *(reinterpret_cast(output.ptr())) = res; + break; } - else + case ReductionOperation::PROD: { + auto carry_res = + wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + T res = 1; + for (int i = 0; i < S / 2; ++i) + { + res *= wrapper::vgetlane(carry_res, i); + } + // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - res += *(input_ptr + x); + res *= *(input_ptr + x); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res /= input_dim_0; + *(reinterpret_cast(output.ptr())) = res; + break; } - - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - T res = 1; - for(int i = 0; i < S / 2; ++i) + case ReductionOperation::ARG_IDX_MIN: { - res *= wrapper::vgetlane(carry_res, i); - } + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - // Compute left-over elements - for(; x < 
window_end_x; ++x) - { - res *= *(input_ptr + x); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; + break; } - - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::ARG_IDX_MAX: { - if(*(input_ptr + x) < res) + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } } + *(reinterpret_cast(output.ptr())) = idx; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MIN: { - if(*(input_ptr + x) > res) + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MAX: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = res; - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; template struct RedOpX_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { using PromotedType = typename wrapper::traits::promote::type>::type; @@ -637,246 +649,257 @@ struct RedOpX_quantized const float B = out_offset - (in_scale * in_offset) / (out_scale); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + + auto vec_res_value1 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); + + typename wrapper::traits::neon_vector::type vec_res_value = {0}; + + if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || + op == ReductionOperation::MIN || op == ReductionOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); + } - auto vec_res_value1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + uint32x4x4_t vec_res_idx{{0}}; + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - typename wrapper::traits::neon_vector::type vec_res_value = { 0 }; + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = 
wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); + const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX) - { - vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); - } + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - uint32x4x4_t vec_res_idx{ { 0 } }; - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); - const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); - - //de-quantize vec_elements - temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); - break; + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = 
wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } + } + + switch (op) + { case ReductionOperation::ARG_IDX_MIN: { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::MIN: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; break; } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - if(*(input_ptr + x) < res) - { - idx = x; - res = *(input_ptr + x); - } - } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - if(*(input_ptr + x) > res) + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::PROD: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); + carry_res = wrapper::vmul(carry_res, vec_res_value3_f); + carry_res = wrapper::vmul(carry_res, vec_res_value4_f); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); - carry_res = wrapper::vmul(carry_res, vec_res_value3_f); - carry_res = wrapper::vmul(carry_res, vec_res_value4_f); + float res = wrapper::vgetlane(carry_res, 0); + res *= wrapper::vgetlane(carry_res, 1); + res *= wrapper::vgetlane(carry_res, 2); + res *= wrapper::vgetlane(carry_res, 3); - float res = wrapper::vgetlane(carry_res, 0); - res *= wrapper::vgetlane(carry_res, 1); - res *= wrapper::vgetlane(carry_res, 2); - res *= wrapper::vgetlane(carry_res, 3); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + //de-quantize input + if (std::is_same::value) + { + res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + } + else + { + res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + } + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - //de-quantize input - if(std::is_same::value) + //re-quantize result + if (std::is_same::value) { - res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + res = quantize_qasymm8(res, iq_info); } else { - res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + res = quantize_qasymm8_signed(res, iq_info); } - } - //re-quantize result - if(std::is_same::value) - { - res = quantize_qasymm8(res, iq_info); + *reinterpret_cast(output.ptr()) = static_cast(res); + break; } - else + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: { - res = quantize_qasymm8_signed(res, iq_info); - } + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); - *reinterpret_cast(output.ptr()) = static_cast(res); - break; - } - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); - carry_res = wrapper::vadd(carry_res, vec_res_value3); - carry_res = wrapper::vadd(carry_res, vec_res_value4); + auto carry_paddition = + wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); - auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res += *(input_ptr + x); - } + if (op == ReductionOperation::MEAN_SUM) + { + const int32_t resFinal = A * (static_cast(res)) + B; - if(op == ReductionOperation::MEAN_SUM) - { - const int32_t resFinal = A * (static_cast(res)) + B; + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); + } + else + { + // Subtract accumulated offsets + res -= (in_info.dimension(0) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + } - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); - } - else - { - // Subtract accumulated offsets - res -= (in_info.dimension(0) - 1) * iq_info.offset; - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + break; } - - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - 
ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; @@ -887,7 +910,12 @@ struct RedOpYZW using ExactTagType = typename wrapper::traits::neon_vector::tag_type; using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const int window_step_x = 16 / sizeof(T); @@ -900,203 +928,210 @@ struct RedOpYZW Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value = { 0 }; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vloadq(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - break; - } - default: - { - vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - break; - } - } - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast(input.ptr()); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + neon_vector vec_res_value = {0}; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + vec_res_value = wrapper::vloadq(input_ptr + x); break; } 
- case ReductionOperation::MIN: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); break; } - case ReductionOperation::MAX: + default: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } + uint32x4x4_t vec_res_idx{{0}}; - if(op == ReductionOperation::MEAN_SUM) - { - auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); - vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); - } - - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - wrapper::vstore(reinterpret_cast(output.ptr()) + x, vec_res_idx.val[0]); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - if(std::is_same::value) + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { - wrapper::vstore(reinterpret_cast(output.ptr()) + x + 4, vec_res_idx.val[1]); + const T *in_ptr = + reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - } - else - { - wrapper::vstore(reinterpret_cast(output.ptr() + x * sizeof(T)), vec_res_value); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value = 0.f; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: + if (op == ReductionOperation::MEAN_SUM) { - res_value = *(input_ptr + x); - break; + auto vec_width_inv = + wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); } - case ReductionOperation::PROD: + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { - res_value = static_cast(1.f); - break; + wrapper::vstore(reinterpret_cast(output.ptr()) + x, vec_res_idx.val[0]); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output.ptr()) + x + 4, vec_res_idx.val[1]); + } +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC } - default: + else { - res_value = static_cast(0.f); - break; + wrapper::vstore(reinterpret_cast(output.ptr() + x * sizeof(T)), vec_res_value); } } - uint32_t res_idx = 0; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute left-over elements + for (; x < window_end_x; ++x) { - const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - - switch(op) + auto res_value = 0.f; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - res_value += *in_ptr; - break; - case ReductionOperation::SUM_SQUARE: - res_value += *in_ptr * *in_ptr; - break; - case ReductionOperation::PROD: - res_value *= *in_ptr; - break; + case ReductionOperation::ARG_IDX_MAX: case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - if(*in_ptr < res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = *(input_ptr + x); break; } - case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::PROD: { - if(*in_ptr > res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = static_cast(1.f); break; } - case ReductionOperation::MIN: + default: { - res_value = *in_ptr < res_value ? *in_ptr : res_value; + res_value = static_cast(0.f); break; } - case ReductionOperation::MAX: + } + + uint32_t res_idx = 0; + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + + switch (op) { - res_value = *in_ptr > res_value ? *in_ptr : res_value; - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + res_value += *in_ptr; + break; + case ReductionOperation::SUM_SQUARE: + res_value += *in_ptr * *in_ptr; + break; + case ReductionOperation::PROD: + res_value *= *in_ptr; + break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? 
*in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res_value /= in_info.dimension(axis); - } + if (op == ReductionOperation::MEAN_SUM) + { + res_value /= in_info.dimension(axis); + } - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - *(reinterpret_cast(output.ptr()) + x) = res_idx; - } - else - { - *(reinterpret_cast(output.ptr() + x * sizeof(T))) = res_value; + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + *(reinterpret_cast(output.ptr()) + x) = res_idx; + } + else + { + *(reinterpret_cast(output.ptr() + x * sizeof(T))) = res_value; + } } - } - }, - input, output); + }, + input, output); } }; @@ -1107,7 +1142,8 @@ struct RedOpYZW_complex using ExactTagType = typename wrapper::traits::neon_vector::tag_type; using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) { ARM_COMPUTE_ERROR_ON(axis != 2); ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); @@ -1124,70 +1160,77 @@ struct RedOpYZW_complex Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value_0 = { 0 }; - neon_vector vec_res_value_1 = { 0 }; - - vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - T *in_ptr_0 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - T *in_ptr_1 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); + neon_vector vec_res_value_0 = {0}; + neon_vector vec_res_value_1 = {0}; - const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); - const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); - vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); - } + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr_0 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + T *in_ptr_1 = 
reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); - wrapper::vstore(out_ptr, vec_res_value_0); - wrapper::vstore(out_ptr + 4, vec_res_value_1); - } + const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); + const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value_0 = 0.f; - auto res_value_1 = 0.f; + vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); + vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); + } - T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + wrapper::vstore(out_ptr, vec_res_value_0); + wrapper::vstore(out_ptr + 4, vec_res_value_1); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - T *in_ptr = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - res_value_0 += *in_ptr; - res_value_1 += *(in_ptr + 1); + auto res_value_0 = 0.f; + auto res_value_1 = 0.f; + + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + res_value_0 += *in_ptr; + res_value_1 += *(in_ptr + 1); + } + *out_ptr = res_value_0; + *(out_ptr + 1) = res_value_1; } - *out_ptr = res_value_0; - *(out_ptr + 1) = res_value_1; - } - }, - input, output); + }, + input, output); } }; template struct RedOpYZW_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - using PromotedType = typename wrapper::traits::promote::type>::type; + using PromotedType = typename wrapper::traits::promote::type>::type; const auto oq_info = out->info()->quantization_info().uniform(); @@ -1201,12 +1244,14 @@ struct RedOpYZW_quantized Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); - using vector_type = typename wrapper::traits::neon_bitvector::type; + using vector_type = + typename wrapper::traits::neon_bitvector::type; using vector_type_f = typename wrapper::traits::neon_vector::type; vector_type vec_res_value1{}; @@ -1234,362 +1279,384 @@ struct RedOpYZW_quantized const auto vec_B = wrapper::vdup_n(static_cast(B), wrapper::traits::vector_128_tag{}); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - uint32x4x4_t vec_res_idx{ { 0 } }; - vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value2 = 
wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + const auto input_ptr = reinterpret_cast(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + uint32x4x4_t vec_res_idx{{0}}; + vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value2 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - auto vec_res_value = wrapper::vloadq(input_ptr + x); + auto vec_res_value = wrapper::vloadq(input_ptr + x); - for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) - { - const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: + const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, 
vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), + wrapper::traits::vector_128_tag{}); + const auto scale32x4f_4 = + wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + } - //de-quantize vec_elements - temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), 
scale32x4f_4); - temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); - break; - } + switch (op) + { case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 12, + vec_res_idx.val[3]); break; } case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + wrapper::vstore(reinterpret_cast(output.ptr() + x), vec_res_value); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x), vec_res_idx.val[0]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]); - break; - } - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - wrapper::vstore(reinterpret_cast(output.ptr() + x), vec_res_value); - break; - } - case ReductionOperation::SUM: - { - // Subtract offsets - auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); + case ReductionOperation::SUM: + { + // Subtract offsets + auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); - auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); - auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); - auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); - auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); + auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); + auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); + auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); + auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); - vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); - vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); - vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); - vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); + vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); + vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, 
offsets); + vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); + vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); - combine_and_store(temp16x8t_1, temp16x8t_2, output, x); - break; - } - case ReductionOperation::MEAN_SUM: - { - vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value1), vec_A); - vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value2), vec_A); - vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value3), vec_A); - vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); + combine_and_store(temp16x8t_1, temp16x8t_2, output, x); + break; + } + case ReductionOperation::MEAN_SUM: + { + vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value1), vec_A); + vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value2), vec_A); + vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value3), vec_A); + vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); #ifdef __aarch64__ - vec_res_value1 = wrapper::vcvta(vec_res_value1_f); - vec_res_value2 = wrapper::vcvta(vec_res_value2_f); - vec_res_value3 = wrapper::vcvta(vec_res_value3_f); - vec_res_value4 = wrapper::vcvta(vec_res_value4_f); + vec_res_value1 = wrapper::vcvta(vec_res_value1_f); + vec_res_value2 = wrapper::vcvta(vec_res_value2_f); + vec_res_value3 = wrapper::vcvta(vec_res_value3_f); + vec_res_value4 = wrapper::vcvta(vec_res_value4_f); #else // defined(__aarch64__) - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); #endif // __aarch64__ - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - wrapper::vstore(reinterpret_cast(output.ptr() + x), res); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); - - //re-quantize - vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); - vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); - vec_res_value3_f = 
wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); - vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); - - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); - - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - - wrapper::vstore(reinterpret_cast(output.ptr() + x), res); - break; + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = + wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); + const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); + + //re-quantize + vec_res_value1_f = + wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); + vec_res_value2_f = + wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); + vec_res_value3_f = + wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); + vec_res_value4_f = + wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); + + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - float res_value = 0.f; - int32_t res_value_q = 0; - - switch(op) + // Compute left-over elements + for (; x < window_end_x; ++x) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - res_value = *(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - res_value = static_cast(1.0f); - break; - } - default: - { - res_value = static_cast(0.0f); - break; - } - } - uint32_t res_idx = 0; + float res_value = 0.f; + int32_t res_value_q = 0; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - const T *in_ptr = reinterpret_cast(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); - switch(op) + switch (op) { - case ReductionOperation::SUM: + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - res_value += *in_ptr; + res_value = *(input_ptr + x); break; } - case ReductionOperation::MEAN_SUM: + case ReductionOperation::PROD: { - res_value_q += *in_ptr; + res_value = static_cast(1.0f); break; } - case ReductionOperation::SUM_SQUARE: + default: { - res_value += *in_ptr * *in_ptr; + res_value = static_cast(0.0f); break; } - case 
ReductionOperation::PROD: + } + uint32_t res_idx = 0; + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); + switch (op) { - //de-quantize input - if(std::is_same::value) + case ReductionOperation::SUM: { - res_value *= dequantize_qasymm8(*in_ptr, iq_info); + res_value += *in_ptr; + break; } - else + case ReductionOperation::MEAN_SUM: { - res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + res_value_q += *in_ptr; + break; } - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - if(*in_ptr < res_value) + case ReductionOperation::SUM_SQUARE: { - res_value = *in_ptr; - res_idx = dim; + res_value += *in_ptr * *in_ptr; + break; } - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - if(*in_ptr > res_value) + case ReductionOperation::PROD: { - res_value = *in_ptr; - res_idx = dim; + //de-quantize input + if (std::is_same::value) + { + res_value *= dequantize_qasymm8(*in_ptr, iq_info); + } + else + { + res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + } + break; } - break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - case ReductionOperation::MIN: + } + + switch (op) + { + case ReductionOperation::MEAN_SUM: { - res_value = *in_ptr < res_value ? *in_ptr : res_value; + // Apply previously calculated coefficients (with rounding on aarch64) +#ifdef __aarch64__ + const int32_t res = + arm_compute::support::cpp11::round(A * (static_cast(res_value_q)) + B); +#else // defined(__aarch64__) + const int32_t res = A * (static_cast(res_value_q)) + B; +#endif // __aarch64__ + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); break; } - case ReductionOperation::MAX: + case ReductionOperation::SUM: { - res_value = *in_ptr > res_value ? 
*in_ptr : res_value; + // Subtract accumulated offsets + res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res_value); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::MEAN_SUM: - { - // Apply previously calculated coefficients (with rounding on aarch64) -#ifdef __aarch64__ - const int32_t res = arm_compute::support::cpp11::round(A * (static_cast(res_value_q)) + B); -#else // defined(__aarch64__) - const int32_t res = A * (static_cast(res_value_q)) + B; -#endif // __aarch64__ - *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); - break; - } - case ReductionOperation::SUM: - { - // Subtract accumulated offsets - res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; - *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res_value); - break; - } - case ReductionOperation::PROD: - { - //re-quantize result - T res = 0; - if(std::is_same::value) + case ReductionOperation::PROD: { - res = quantize_qasymm8(res_value, iq_info); + //re-quantize result + T res = 0; + if (std::is_same::value) + { + res = quantize_qasymm8(res_value, iq_info); + } + else + { + res = quantize_qasymm8_signed(res_value, iq_info); + } + *(reinterpret_cast(output.ptr() + x)) = res; + break; } - else + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: { - res = quantize_qasymm8_signed(res_value, iq_info); + *(reinterpret_cast(output.ptr() + x * 4)) = res_idx; + break; } - *(reinterpret_cast(output.ptr() + x)) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - *(reinterpret_cast(output.ptr() + x * 4)) = res_idx; - break; + default: + *(reinterpret_cast(output.ptr() + x)) = res_value; } - default: - *(reinterpret_cast(output.ptr() + x)) = res_value; } - } - }, - input, output); + }, + input, output); } }; -void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) +void reduce_op( + const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) { const bool is_complex = (input->info()->num_channels() == 2); - if(is_complex) + if (is_complex) { - switch(axis) + switch (axis) { case 2: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::F32: - switch(op) + switch (op) { case ReductionOperation::SUM: - return Reducer>::reduceZ(window, input, output, RedOpYZW_complex(), op); + return Reducer>::reduceZ( + window, input, output, RedOpYZW_complex(), + op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -1602,19 +1669,21 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi return; } - switch(axis) + switch (axis) { case 0: { - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + return Reducer>::reduceX(window, input, output, + RedOpX_quantized(), op); } case DataType::QASYMM8_SIGNED: { - return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + return Reducer>::reduceX(window, input, output, RedOpX_quantized(), + op); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -1635,19 +1704,22 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi } } case 1: - switch(input->info()->data_type()) + switch 
(input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceY(window, input, output, + RedOpYZW_quantized(), op); } case DataType::QASYMM8_SIGNED: { - return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceY(window, input, output, + RedOpYZW_quantized(), op); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW(), op); + return Reducer>::reduceY(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceY(window, input, output, RedOpYZW(), op); @@ -1657,15 +1729,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi ARM_COMPUTE_ERROR("Not supported"); } case 2: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceZ(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceZ(window, input, output, + RedOpYZW_quantized(), op); case DataType::QASYMM8_SIGNED: - return Reducer>::reduceZ(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceZ(window, input, output, + RedOpYZW_quantized(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); + return Reducer>::reduceZ(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); @@ -1675,15 +1750,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi ARM_COMPUTE_ERROR("Not supported"); } case 3: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceW(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceW(window, input, output, + RedOpYZW_quantized(), op); case DataType::QASYMM8_SIGNED: - return Reducer>::reduceW(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceW(window, input, output, + RedOpYZW_quantized(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW(), op); + return Reducer>::reduceW(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceW(window, input, output, RedOpYZW(), op); @@ -1704,9 +1782,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - if(input->num_channels() == 1) + if (input->num_channels() == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::S32, DataType::F16, DataType::F32); } else { @@ -1715,13 +1794,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(axis != 2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + 
"Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - if(output->total_size() != 0) + if (output->total_size() != 0) { bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN); - if(!is_arg_min_max) + if (!is_arg_min_max) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); @@ -1731,8 +1811,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32); } - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); - const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); } @@ -1745,7 +1826,10 @@ NEReductionOperationKernel::NEReductionOperationKernel() { } -void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op) +void NEReductionOperationKernel::configure(const ITensor *input, + ITensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -1761,14 +1845,23 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output INEKernel::configure(win); // Calculate output shape and set if empty - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); // Output auto initialization if not yet initialized const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); DataType output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); } -Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status NEReductionOperationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h index 08e654fd21..78bec62c14 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.h +++ b/src/core/NEON/kernels/NEReductionOperationKernel.h @@ -77,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp index 1a7f58bb08..f92a4c87da 100644 --- a/src/core/NEON/kernels/NEReorderKernel.cpp +++ b/src/core/NEON/kernels/NEReorderKernel.cpp @@ -24,11 +24,13 @@ #if defined(__aarch64__) #include "src/core/NEON/kernels/NEReorderKernel.h" -#include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/arm_gemm/transform.hpp" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/arm_gemm/transform.hpp" + namespace arm_compute { @@ -37,29 +39,32 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: { const int ksize_rows_elements = _xmax * _ksize; - const int jump_rows = ksize_rows_elements * window.x().start(); - const int k_start = window.x().start() * _ksize; - const int k_end = std::min(window.x().end() * _ksize, _kmax); - const int stride = _kmax; - if(k_start < k_end) + const int jump_rows = ksize_rows_elements * window.x().start(); + const int k_start = window.x().start() * _ksize; + const int k_end = std::min(window.x().end() * _ksize, _kmax); + const int stride = _kmax; + if (k_start < k_end) { - - switch(_output_wf) + switch (_output_wf) { case WeightFormat::OHWIo4: { - arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast(_output->buffer()) + jump_rows, reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); break; } #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: { - arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(reinterpret_cast(_output->buffer()) + jump_rows, reinterpret_cast(_input->buffer()), stride, k_start, 
k_end, 0, _xmax); + arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); break; } #endif /* ARM_COMPUTE_ENABLE_SVE */ @@ -78,11 +83,20 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) } NEReorderKernel::NEReorderKernel() - : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY) + : _input(nullptr), + _output(nullptr), + _ksize(0), + _kmax(0), + _xmax(0), + _input_wf(WeightFormat::ANY), + _output_wf(WeightFormat::ANY) { } -void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +void NEReorderKernel::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -96,7 +110,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu // Setting parameters for transform auto dims = input->info()->num_dimensions(); - switch(dims) + switch (dims) { case 2: { @@ -120,7 +134,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu // Window size is set by rows / _ksize Window win; int window_size = 0; - switch(_output_wf) + switch (_output_wf) { #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: @@ -142,7 +156,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu break; } } - if(_kmax % _ksize != 0) + if (_kmax % _ksize != 0) { window_size += 1; } @@ -152,11 +166,14 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu INEKernel::configure(win); } -Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +Status NEReorderKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -167,20 +184,20 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou int output_x_dim; int output_k_dim; auto dims = output->num_dimensions(); - switch(dims) + switch (dims) { case 2: { - input_x_dim = input->dimension(0); // Number of columns in input matrix - input_k_dim = input->dimension(1); // Number of rows in input matrix + input_x_dim = input->dimension(0); // Number of columns in input matrix + input_k_dim = input->dimension(1); // Number of rows in input matrix output_x_dim = output->dimension(0); // Number of columns in output matrix output_k_dim = output->dimension(1); // Number of rows in output matrix break; } case 4: { - input_x_dim = input->dimension(2); // Number of columns in input matrix - input_k_dim = input->dimension(3); // Number of rows in input matrix + input_x_dim = input->dimension(2); // Number of columns in input matrix + input_k_dim = input->dimension(3); // Number of rows in input matrix output_x_dim = 
output->dimension(2); // Number of columns in output matrix output_k_dim = output->dimension(3); // Number of rows in output matrix break; @@ -192,7 +209,7 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou } int ksize; - switch(output_wf) + switch (output_wf) { case WeightFormat::OHWIo8: { @@ -216,11 +233,10 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim); // output x_dim needs to be same as input ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim); - } return Status{}; } } // namespace arm_compute -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h index 07908890f4..4528b25245 100644 --- a/src/core/NEON/kernels/NEReorderKernel.h +++ b/src/core/NEON/kernels/NEReorderKernel.h @@ -26,9 +26,10 @@ #ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL #define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL -#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { @@ -36,7 +37,6 @@ namespace arm_compute class NEReorderKernel : public INEKernel { public: - const char *name() const override { return "NEReorderKernel"; @@ -62,7 +62,10 @@ public: * @param[in] input_wf WeightFormat of input. * @param[in] output_wf WeightFormat of output. */ - void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + void configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); /** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel * @@ -73,25 +76,27 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - -/*****************************************************************************/ + /*****************************************************************************/ private: - const ITensor *_input{nullptr}; // Input tensor - ITensor *_output{nullptr}; // Output tensor - int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call - int32_t _kmax{0}; // Rows in input tensor - int32_t _xmax{0}; // Columns in input tensor - WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor - WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor + const ITensor *_input{nullptr}; // Input tensor + ITensor *_output{nullptr}; // Output tensor + int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call + int32_t _kmax{0}; // Rows in input tensor + int32_t _xmax{0}; // Columns in input tensor + WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor + WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor }; } // namespace arm_compute #endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */ -#endif // defined(__aarch64__) \ No newline at end of file +#endif // 
defined(__aarch64__) diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp index a7b830c066..227570405c 100644 --- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp +++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, + "The width of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, + "The height of the input tensor must be a multiple of stride"); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); + const TensorInfo tensor_info_output = + output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -NEReorgLayerKernel::NEReorgLayerKernel() - : _input(nullptr), _output(nullptr), _stride(1) +NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1) { } @@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info) Iterator out(_output, collapsed_window); // Perform reorg - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - // Get spatial coords and channels - const unsigned int w = id[idx_w]; - const unsigned int h = id[idx_h]; - const unsigned int c = id[idx_c]; - - // Calculate mapping - const unsigned int offset = c / out_c; - Coordinates map_coords = id; - map_coords.set(idx_w, w * stride + offset % stride); - map_coords.set(idx_h, h * stride + offset / stride); - map_coords.set(idx_c, c % out_c); - - // Perform mapping - std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size()); - }, - out); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + // Get spatial coords and channels + const unsigned int w = id[idx_w]; + const unsigned int h = id[idx_h]; + const unsigned int c = id[idx_c]; + + // Calculate mapping + const unsigned int offset = c / out_c; + Coordinates map_coords = id; + map_coords.set(idx_w, w * stride + offset % stride); + map_coords.set(idx_h, h * stride + offset / stride); + map_coords.set(idx_c, c % out_c); + + 
// Perform mapping + std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), + _input->info()->element_size()); + }, + out); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index ca6c117882..d2437eecd0 100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -26,15 +26,17 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) { ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis); @@ -42,11 +44,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Current implementation only supports up to 4 dimensions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, + "Current implementation only supports up to 4 dimensions."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -57,8 +60,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -NEReverseKernel::NEReverseKernel() - : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false) +NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false) { } @@ -80,7 +82,10 @@ void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITe INEKernel::configure(calculate_max_window(*output->info())); } -Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) +Status NEReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis)); @@ -88,29 +93,30 @@ Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *ou } template -void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis) +void run_reverse( + const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis) { unsigned int axis_bit = 0; const int rank = input->info()->num_dimensions(); - for(unsigned int i = 0; i < axis->info()->dimension(0); ++i) + for (unsigned int i = 0; i < axis->info()->dimension(0); ++i) { int axis_i = 
*(reinterpret_cast(axis->buffer()) + i); // The values of axis tensor must be between [-rank, rank-1]. - if((axis_i < -rank) || (axis_i >= rank)) + if ((axis_i < -rank) || (axis_i >= rank)) { ARM_COMPUTE_ERROR("the valuses of the axis tensor must be within [-rank, rank-1]."); } // In case of negative axis value i.e targeted axis(i) = rank + axis(i) - if(axis_i < 0) + if (axis_i < 0) { axis_i = rank + axis_i; } // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis - if(use_inverted_axis) + if (use_inverted_axis) { axis_i = (rank - 1) - axis_i; } @@ -127,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto in = wrapper::vloadq(reinterpret_cast(input_it.ptr()) + x); - - // Reverse 0 axis - if(axis_bit & 0x1) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - in = wrapper::vrev64(in); - in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + auto in = wrapper::vloadq(reinterpret_cast(input_it.ptr()) + x); + + // Reverse 0 axis + if (axis_bit & 0x1) + { + in = wrapper::vrev64(in); + in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + } + + const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x; + const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); + const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); + const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; + + auto out_ptr = + reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))); + wrapper::vstore(out_ptr, in); } - const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x; - const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); - const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); - const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; - - auto out_ptr = reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))); - wrapper::vstore(out_ptr, in); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto in = *(reinterpret_cast(input_it.ptr()) + x); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto in = *(reinterpret_cast(input_it.ptr()) + x); - const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x; - const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); - const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); - const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; + const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x; + const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); + const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); + const int offset_w = (axis_bit & 0x8) ? 
output->info()->dimension(3) - id[3] - 1 : id[3]; - *reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in; - } - }, - input_it); + *reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = + in; + } + }, + input_it); } void NEReverseKernel::run(const Window &window, const ThreadInfo &info) @@ -172,7 +182,7 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_input->info()->element_size()) + switch (_input->info()->element_size()) { case 4: run_reverse(window, _input, _axis, _output, _use_inverted_axis); diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h index 7d9ec4691c..92261887f4 100644 --- a/src/core/NEON/kernels/NEReverseKernel.h +++ b/src/core/NEON/kernels/NEReverseKernel.h @@ -68,7 +68,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index b8c9b244ee..7789b828ea 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" - +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/select/list.h" #include @@ -54,7 +53,8 @@ struct SelectKernelSelectorData }; using SelectorPtr = std::add_pointer::type; -using KernelPtr = std::add_pointer::type; +using KernelPtr = + std::add_pointer::type; struct SelectKernelSelector { @@ -63,95 +63,62 @@ struct SelectKernelSelector KernelPtr ukernel; }; -static const SelectKernelSelector available_kernels[] = -{ - { - "neon_s8_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank) - }, - { - "neon_s16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank) - }, - { - "neon_s32_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank) - }, - { - "neon_u8_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank) - }, - { - "neon_u16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank) - }, - { - "neon_u32_same_rank", - [](const 
SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank) - }, - { - "neon_s8_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank) - }, - { - "neon_s16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank) - }, - { - "neon_s32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank) - }, - { - "neon_u8_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank) - }, - { - "neon_u16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank) - }, - { - "neon_u32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank) - }, - { - "neon_f16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank) - }, - { - "neon_f16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank) - }, - { - "neon_f32_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank) - }, - { - "neon_f32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank) - }, +static const SelectKernelSelector available_kernels[] = { + {"neon_s8_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)}, + {"neon_s16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)}, + {"neon_s32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)}, + {"neon_u8_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)}, + {"neon_u16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)}, + {"neon_u32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && 
data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)}, + {"neon_s8_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)}, + {"neon_s16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)}, + {"neon_s32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)}, + {"neon_u8_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)}, + {"neon_u16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)}, + {"neon_u32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)}, + {"neon_f16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)}, + {"neon_f16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)}, + {"neon_f32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)}, + {"neon_f32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)}, }; const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -184,7 +151,8 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor INEKernel::configure(win); } -Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) +Status +NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x); @@ -195,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape())); - ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); + ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && + ((c->tensor_shape().num_dimensions() > 1) || + (c->tensor_shape().x() != 
x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output); @@ -214,7 +184,7 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_output == nullptr); ARM_COMPUTE_ERROR_ON(_output->info() == nullptr); - const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank }); + const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank}); ARM_COMPUTE_ERROR_ON(uk == nullptr); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); uk->ukernel(_c, _x, _y, _output, window); diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h index e82105a68e..4fec42b536 100644 --- a/src/core/NEON/kernels/NESelectKernel.h +++ b/src/core/NEON/kernels/NESelectKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESELECTKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -82,7 +83,6 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - const ITensor *_c; /**< Condition tensor */ const ITensor *_x; /**< Source tensor 1 */ const ITensor *_y; /**< Source tensor 2 */ diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp index 673eace3c1..da023aeb96 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -41,19 +42,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *block_info, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2}); ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2}); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_channel = get_data_layout_dimension_index(data_layout, 
DataLayoutDimension::CHANNEL); @@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input, block_shape_x, block_shape_y, padding_left, padding_right); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -86,14 +95,25 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape } // namespace NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel() - : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y() + : _input(nullptr), + _block_shape(nullptr), + _paddings(nullptr), + _output(nullptr), + _data_layout(DataLayout::UNKNOWN), + _padding_left(), + _block_shape_x(), + _block_shape_y() { } -void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayerKernel::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); _input = input; _block_shape = block_shape; @@ -106,15 +126,22 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *b ICPPKernel::configure(win); } -void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ITensor *output) +void NESpaceToBatchLayerKernel::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, 
padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_ INEKernel::configure(win); } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { // Retrieve the block shapes dynamically _block_shape_x = *(reinterpret_cast(_block_shape->ptr_to_element(0))); _block_shape_y = *(reinterpret_cast(_block_shape->ptr_to_element(1))); } - if(_paddings != nullptr) + if (_paddings != nullptr) { - const size_t pad_left_x = *reinterpret_cast(_paddings->ptr_to_element({ 0, 0 })); - const size_t pad_left_y = *reinterpret_cast(_paddings->ptr_to_element({ 1, 0 })); + const size_t pad_left_x = *reinterpret_cast(_paddings->ptr_to_element({0, 0})); + const size_t pad_left_y = *reinterpret_cast(_paddings->ptr_to_element({1, 0})); _padding_left = Size2D(pad_left_x, pad_left_y); } const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); @@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t out_x = id.x(); - const size_t out_y = id.y(); - const size_t z = id.z(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % 
batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ in_x, in_y, z, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.x(); + const size_t out_y = id.y(); + const size_t z = id.z(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{in_x, in_y, z, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t out_x = id.y(); - const size_t out_y = id.z(); - const size_t z = id.x(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ z, in_x, in_y, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.y(); + const size_t out_y = id.z(); + const size_t z = id.x(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{z, in_x, in_y, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h index 44b8cbb514..6292c07136 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -69,7 +70,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. 
Data types supported: same as @p input */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + void configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -79,7 +85,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -91,7 +100,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp index 7687c50c40..b49c5ee344 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t channel_id = id.z(); - const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ in_x, in_y, z, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); 
+ execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.z(); + const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{in_x, in_y, z, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t channel_id = id.x(); - const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ z, in_x, in_y, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.x(); + const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{z, in_x, in_y, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h index 953b68a401..7d147c5b94 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp index 93080e2ac7..e23b40a9aa 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.cpp +++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp @@ -25,13 +25,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -41,7 +41,11 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
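For reference, the space-to-depth and space-to-batch loops reformatted above all follow the same shape: derive an input coordinate from each output coordinate, then copy a single element. A minimal standalone sketch of the NCHW space-to-depth index arithmetic, using plain flat buffers and made-up function and variable names rather than the library's Window/Iterator/Coordinates machinery (illustrative only, not part of this patch):

    #include <cstddef>
    #include <vector>

    // Illustrative only: same index arithmetic as the kernel lambda above.
    // For an output coordinate (x, y, co, b) the source element sits at
    //   in_x = x * block + (co / c_in) % block
    //   in_y = y * block + (co / c_in) / block
    //   in_c = co % c_in
    std::vector<float> space_to_depth_nchw(const std::vector<float> &in,
                                           std::size_t n, std::size_t c_in,
                                           std::size_t h, std::size_t w, std::size_t block)
    {
        const std::size_t h_out = h / block, w_out = w / block, c_out = c_in * block * block;
        std::vector<float> out(n * c_out * h_out * w_out);
        for (std::size_t b = 0; b < n; ++b)
            for (std::size_t co = 0; co < c_out; ++co)
                for (std::size_t y = 0; y < h_out; ++y)
                    for (std::size_t x = 0; x < w_out; ++x)
                    {
                        const std::size_t in_x = x * block + (co / c_in) % block;
                        const std::size_t in_y = y * block + (co / c_in) / block;
                        const std::size_t in_c = co % c_in;
                        // One element per output coordinate, mirroring the memcpy in the kernel.
                        out[((b * c_out + co) * h_out + y) * w_out + x] =
                            in[((b * c_in + in_c) * h + in_y) * w + in_x];
                    }
        return out;
    }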
@@ -50,9 +54,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -60,7 +65,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) +std::pair +validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); @@ -71,11 +77,12 @@ std::pair validate_and_configure_window(ITensorInfo *input, unsi return std::make_pair(Status{}, win); } -inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input) +inline Coordinates +shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input) { constexpr int max_out_coord = 5; // Input shape is max a 4D shape, output is max 5D Coordinates id_out = id; - for(unsigned int i = max_out_coord - 1; i > axis; --i) + for (unsigned int i = max_out_coord - 1; i > axis; --i) { id_out.set(i, id[i - 1]); } @@ -84,12 +91,12 @@ inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, } } // namespace -NEStackLayerKernel::NEStackLayerKernel() - : _input(nullptr), _output(nullptr), _axis(), _idx_input() +NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input() { } -void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output) +void NEStackLayerKernel::configure( + const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info())); @@ -106,10 +113,15 @@ void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsi INEKernel::configure(win_config.second); } -Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status NEStackLayerKernel::validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); return Status{}; } @@ -131,12 +143,15 @@ void NEStackLayerKernel::run(const Window &window, 
const ThreadInfo &info) const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0; const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input); - const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k; - std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size()); - }, - input); + execute_window_loop( + window, + [&](const Coordinates &id) + { + Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input); + const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + + id_out[4] * stride_k; + std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size()); + }, + input); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h index 9b36518e4d..685812b56d 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.h +++ b/src/core/NEON/kernels/NEStackLayerKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_NESTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -64,7 +65,8 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output); + void configure( + const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel * * @note Supported input tensor rank: up to 4 @@ -78,7 +80,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output); // Inherited methods overridden void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp index 2b406a8b8b..efff51be9d 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp +++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, 
+ int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ -71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +std::pair validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { // Output tensor auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); // Create window @@ -88,38 +95,49 @@ std::pair validate_and_configure_window(const ITensorInfo *input } } // namespace -NEStridedSliceKernel::NEStridedSliceKernel() - : _starts_abs(), _final_strides(), _shrink_mask() +NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask() { } -void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSliceKernel::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, 
shrink_axis_mask)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); _shrink_mask = shrink_axis_mask; const TensorShape &input_shape = input->tensor_shape(); Coordinates ends_abs; - std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(_starts_abs, ends_abs, _final_strides) = + arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + auto win_config = + validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), - starts, ends, strides, begin_mask, end_mask, shrink_axis_mask) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends, + strides, begin_mask, end_mask, shrink_axis_mask) + .first); return Status{}; } @@ -156,7 +174,7 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co size_t length_x = win.shape()[0]; - if(_final_strides[0] == 1 && !is_shrink_x) + if (_final_strides[0] == 1 && !is_shrink_x) { win.set(Window::DimX, Window::Dimension(0, 1, 1)); width_size = width_size * length_x; @@ -183,16 +201,17 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co uint8_t *cur_ptr; execute_window_loop( - win, [&](const Coordinates & id) - { - cur_ptr = input_base; - cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; - cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; - cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; - cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; - - std::copy_n(cur_ptr, width_size, output_it.ptr()); - }, - output_it); + win, + [&](const Coordinates &id) + { + cur_ptr = input_base; + cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; + cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; + cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; + cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; + + std::copy_n(cur_ptr, width_size, output_it.ptr()); + }, + output_it); } } // namespace 
arm_compute diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h index 9ce517417d..a475f09a17 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.h +++ b/src/core/NEON/kernels/NEStridedSliceKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" #include @@ -68,9 +69,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel * @@ -86,9 +92,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp index 94256dc12d..577ce5b69e 100644 --- a/src/core/NEON/kernels/NETileKernel.cpp +++ b/src/core/NEON/kernels/NETileKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -NETileKernel::NETileKernel() - : _input(nullptr), _output(nullptr) +NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr) { } @@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - Window output_window{ window }; - output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0))); + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); Window out_slice = output_window.first_slice_window_1D(); const auto src_shape = _input->info()->tensor_shape(); @@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, [&](const Coordinates & id) - { - const size_t x = id.x(); - const size_t y = id.y(); - const size_t z = id.z(); - const size_t w = id[3]; - Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] }; - memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size()); - }, - output_it); - } - while(output_window.slide_window_slice_1D(out_slice)); + execute_window_loop( + out_slice, + [&](const Coordinates &id) + { + const size_t x = id.x(); + const size_t y = id.y(); + const size_t z = id.z(); + const size_t w = id[3]; + Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]}; + memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), + _input->info()->dimension(0) * _input->info()->element_size()); + }, + output_it); + } while (output_window.slide_window_slice_1D(out_slice)); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp index dbd47ccfa9..13c2d314e4 100644 --- a/src/core/NEON/kernels/assembly/depthwise.hpp +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -38,9 +38,8 @@ struct DepthwiseConfig DepthwiseMethod method = DepthwiseMethod::DEFAULT; std::string filter = ""; - DepthwiseConfig(DepthwiseMethod method) - : method(method) {}; - DepthwiseConfig() {}; + DepthwiseConfig(DepthwiseMethod method) : method(method){}; + DepthwiseConfig(){}; }; struct DepthwiseArgs @@ -63,18 +62,24 @@ struct DepthwiseArgs bool fast_mode = false; - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int dilation_rows, unsigned int dilation_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - - const DepthwiseConfig *config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int dilation_rows, + unsigned int dilation_cols, + 
unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + + const DepthwiseConfig *config) : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), @@ -95,20 +100,38 @@ struct DepthwiseArgs { } - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - const DepthwiseConfig *config) - : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows, - stride_cols, 1, 1, n_batches, input_rows, input_cols, - input_channels, output_rows, output_cols, - channel_multiplier, padding, activation, config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + const DepthwiseConfig *config) + : DepthwiseArgs(cpu_info, + kernel_rows, + kernel_cols, + stride_rows, + stride_cols, + 1, + 1, + n_batches, + input_rows, + input_cols, + input_channels, + output_rows, + output_cols, + channel_multiplier, + padding, + activation, + config) { } }; @@ -127,17 +150,18 @@ struct Tile { } - Tile() - : Tile(nullptr, 0, 0, 0) + Tile() : Tile(nullptr, 0, 0, 0) { } - void load_from( - const TInput *input, - const unsigned int ld_row, const unsigned int ld_col, - const unsigned int n_rows, const unsigned int n_cols, - const int input_i, const int input_j, - const unsigned int channel_multiplier) const + void load_from(const TInput *input, + const unsigned int ld_row, + const unsigned int ld_col, + const unsigned int n_rows, + const unsigned int n_cols, + const int input_i, + const int input_j, + const unsigned int channel_multiplier) const { const auto pad_top = input_i < 0 ? -input_i : 0; const auto pad_left = input_j < 0 ? 
-input_j : 0; @@ -145,18 +169,15 @@ struct Tile const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top; const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left; - if(padded_rows < tile_rows || padded_cols < tile_cols) + if (padded_rows < tile_rows || padded_cols < tile_cols) { memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput)); } - do_premultiply( - (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, - ld_row, ld_col, - array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, - tile_cols * tile_channels, tile_channels, - padded_rows, padded_cols, tile_channels / channel_multiplier, - channel_multiplier); + do_premultiply((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row, + ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, + tile_cols * tile_channels, tile_channels, padded_rows, padded_cols, + tile_channels / channel_multiplier, channel_multiplier); } }; @@ -168,9 +189,8 @@ protected: std::string m_name{}; public: - DepthwiseCommon(const DepthwiseArgs &args) - : m_args(args) {}; - DepthwiseCommon(DepthwiseCommon &) = delete; + DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){}; + DepthwiseCommon(DepthwiseCommon &) = delete; DepthwiseCommon &operator=(DepthwiseCommon &) = delete; std::string name() const override @@ -181,19 +201,18 @@ public: void set_name(std::string name) { // Only allow the name to be set once - if(m_name.empty()) + if (m_name.empty()) { m_name = name; } } - void execute( - const void *const input, - const void *const parameters, - void *const output, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + const void *const parameters, + void *const output, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { const size_t ld_input_col = m_args.input_channels; const size_t ld_input_row = ld_input_col * m_args.input_cols; @@ -202,56 +221,47 @@ public: const size_t ld_output_row = ld_output_col * m_args.output_cols; const size_t ld_output_batch = ld_output_row * m_args.output_rows; - execute( - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, n_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *const parameters, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *const parameters, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { - execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, - m_args.input_channels, m_args.padding, - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, - m_args.output_rows, m_args.output_cols, - output, 
ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input, + ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads); } - void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &padding, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const override final + void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &padding, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const override final { // Construct a new set of arguments to reflect that we might have been // passed different input/output tensors. Dilation is handled at this @@ -271,38 +281,33 @@ public: auto ld_output_col_d = ld_output_col * m_args.dilation_cols; auto ld_output_row_d = ld_output_row * m_args.dilation_rows; - for(size_t drow = 0; drow < m_args.dilation_rows; drow++) + for (size_t drow = 0; drow < m_args.dilation_rows; drow++) { size_t start_i; - std::tie(args.output_rows, args.input_rows, start_i, - args.padding.top, args.padding.bottom) = - get_reduced_view_for_dilation( - output_height, input_height, drow, m_args.dilation_rows, - m_args.kernel_rows, m_args.stride_rows, padding.top); + std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) = + get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows, + m_args.kernel_rows, m_args.stride_rows, padding.top); auto input_row = static_cast(input) + start_i * ld_input_row; auto output_row = static_cast(output) + drow * ld_output_row; - if(args.output_rows) + if (args.output_rows) { - for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) + for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) { size_t start_j; - std::tie(args.output_cols, args.input_cols, start_j, - args.padding.left, args.padding.right) = - get_reduced_view_for_dilation( - output_width, input_width, dcol, m_args.dilation_cols, - m_args.kernel_cols, m_args.stride_cols, padding.left); + std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) = + get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols, + m_args.kernel_cols, m_args.stride_cols, padding.left); const TInput *input_col = input_row + start_j * ld_input_col; TOutput *output_col = output_row + dcol * ld_output_col; - if(args.output_cols) + if (args.output_cols) { - this->execute_internal( - args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters, - output_col, ld_output_col_d, ld_output_row_d, ld_output_batch, - working_space, thread_id, n_threads); + this->execute_internal(args, 
input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, + parameters, output_col, ld_output_col_d, ld_output_row_d, + ld_output_batch, working_space, thread_id, n_threads); } } } @@ -310,20 +315,19 @@ public: } protected: - virtual void execute_internal( - const DepthwiseArgs &instance_args, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute_internal(const DepthwiseArgs &instance_args, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; virtual bool uses_premultiply() const { diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp index a5db793b3d..5ff848e281 100644 --- a/src/core/NEON/kernels/assembly/depthwise_common.hpp +++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp @@ -49,11 +49,7 @@ struct KernelDescription bool is_default = false; uint64_t cycle_estimate = 0; - KernelDescription( - DepthwiseMethod method, - std::string name, - bool is_default, - uint64_t cycle_estimate) + KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate) : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate) { } @@ -78,58 +74,51 @@ public: // pointer the bias vector (which may be nullptr in the case of no bias) and // a pointer to the array of weights (stored in HWIO order). virtual void pack_parameters( - void *buffer, - const void *biases, - const void *weights, - size_t ld_weight_col = 0, - size_t ld_weight_row = 0) = 0; + void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0; // Determine the amount of working space required virtual size_t get_working_size(unsigned int n_threads) const = 0; // Execute the convolution over the specified area of memory. 
- virtual void execute( - const void *input, // Pointer to input tensor - const void *parameters, // Packed parameters buffer - void *output, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute(const void *input, // Pointer to input tensor + const void *parameters, // Packed parameters buffer + void *output, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; // To handle a dilation factor of D execute the kernel once for each d in @@ -145,12 +134,13 @@ public: // - Number of valid input pixels corresponding to `d` // - Offset of the first pixel corresponding to `d` // - Amount of padding in the view for `d` -std::tuple -get_reduced_view_for_dilation( - size_t out_size, size_t in_size, - size_t d, size_t dilation_factor, - size_t kernel_size, size_t stride, - size_t pad_before); +std::tuple get_reduced_view_for_dilation(size_t out_size, + size_t in_size, + size_t d, + size_t dilation_factor, + size_t kernel_size, + size_t stride, + size_t pad_before); } // namespace depthwise } // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp index f1f70cf1d6..045f9f95d3 100644 --- a/src/core/NEON/kernels/assembly/pool_common.hpp +++ b/src/core/NEON/kernels/assembly/pool_common.hpp @@ -68,45 +68,42 @@ public: virtual size_t get_working_size(unsigned int num_threads) const = 0; // Execute pooling over the specified area of memory. 
- virtual void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; } // namespace pooling diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp index e8db35c593..89d594298e 100644 --- a/src/core/NEON/kernels/assembly/pooling.hpp +++ b/src/core/NEON/kernels/assembly/pooling.hpp @@ -36,9 +36,8 @@ struct PoolingConfig PoolingMethod method = PoolingMethod::DEFAULT; std::string filter = ""; - PoolingConfig(PoolingMethod method) - : method(method) {}; - PoolingConfig() {}; + PoolingConfig(PoolingMethod method) : method(method){}; + PoolingConfig(){}; }; struct PoolingArgs @@ -57,30 +56,40 @@ struct PoolingArgs const PoolingConfig *config; - PoolingArgs( - const CPUInfo *cpu_info, - PoolingType pool_type, - const PoolingWindow &window, - const PoolingStride &stride, - bool exclude_padding, - unsigned int n_batches, - unsigned int input_rows, - unsigned int input_cols, - unsigned int n_channels, - unsigned int output_rows, - unsigned int output_cols, - const PaddingValues &padding, - const PoolingConfig *cfg) - : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols), - n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg) + PoolingArgs(const CPUInfo *cpu_info, + PoolingType pool_type, + const PoolingWindow &window, + const PoolingStride &stride, + bool exclude_padding, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int n_channels, + unsigned int output_rows, + unsigned int output_cols, + const PaddingValues &padding, + const 
PoolingConfig *cfg) + : cpu_info(cpu_info), + pool_type(pool_type), + pool_window(window), + pool_stride(stride), + exclude_padding(exclude_padding), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + n_channels(n_channels), + output_rows(output_rows), + output_cols(output_cols), + padding(padding), + config(cfg) { // If either of the pooling window dimensions are set to zero, meaning // "pool everything", then replace with the corresponding input dimension. - if(pool_window.rows == 0) + if (pool_window.rows == 0) { pool_window.rows = input_rows; } - if(pool_window.cols == 0) + if (pool_window.cols == 0) { pool_window.cols = input_cols; } @@ -100,10 +109,16 @@ struct Requantize32 int32_t per_layer_right_shift = 0; int32_t per_layer_mul = 0; - Requantize32(int32_t input_offset, int32_t output_offset, - int32_t per_layer_left_shift, int32_t per_layer_right_shift, + Requantize32(int32_t input_offset, + int32_t output_offset, + int32_t per_layer_left_shift, + int32_t per_layer_right_shift, int32_t per_layer_mul) - : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul) + : input_offset(input_offset), + output_offset(output_offset), + per_layer_left_shift(per_layer_left_shift), + per_layer_right_shift(per_layer_right_shift), + per_layer_mul(per_layer_mul) { } }; @@ -115,105 +130,88 @@ protected: const PoolingArgs m_args; public: - PoolingCommon(const PoolingArgs &args) - : m_args(args) + PoolingCommon(const PoolingArgs &args) : m_args(args) { } - PoolingCommon(PoolingCommon &) = delete; + PoolingCommon(PoolingCommon &) = delete; PoolingCommon &operator=(PoolingCommon &) = delete; size_t get_working_size(unsigned int) const override = 0; // Execute pooling over the specified area of memory. 
- void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - input, - m_args.n_channels, - m_args.n_channels * m_args.input_cols, - m_args.n_channels * m_args.input_cols * m_args.input_rows, - output, - m_args.n_channels, - m_args.n_channels * m_args.output_cols, - m_args.n_channels * m_args.output_cols * m_args.output_rows, - working_space, - thread_id, num_threads); + this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols, + m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels, + m_args.n_channels * m_args.output_cols, + m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id, + num_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, - input, ld_input_col, ld_input_row, ld_input_batch, - m_args.padding, m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col, + ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads); } - void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &padding, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &padding, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute_internal( - batches, height, width, channels, padding, - input, ld_input_col, ld_input_row, ld_input_batch, - output_height, output_width, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row, + ld_input_batch, output_height, output_width, output, 
ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, num_threads); } protected: - virtual void execute_internal( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const PaddingValues &, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute_internal(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const PaddingValues &, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; template diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp index 16f26de38a..fb97cf8baf 100644 --- a/src/core/NEON/kernels/assembly/premultiply.hpp +++ b/src/core/NEON/kernels/assembly/premultiply.hpp @@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr, const unsigned input_channels, const unsigned int channel_multiplier) { - if(sizeof(T) == 4 && channel_multiplier == 6) + if (sizeof(T) == 4 && channel_multiplier == 6) { - do_premultiply_float_6( - (const float *)in_ptr, ld_row, ld_col, - (float *)out_ptr, out_ld_row, out_ld_col, - tile_rows, tile_cols, - input_channels); + do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col, + tile_rows, tile_cols, input_channels); } else { - for(unsigned int i = 0; i < tile_rows; i++) + for (unsigned int i = 0; i < tile_rows; i++) { const T *ip2 = in_ptr + i * ld_row; T *op2 = out_ptr + i * out_ld_row; - for(unsigned int j = 0; j < tile_cols; j++) + for (unsigned int j = 0; j < tile_cols; j++) { const T *ip = ip2; T *op = op2; - for(unsigned int c = 0; c < input_channels; c++) + for (unsigned int c = 0; c < input_channels; c++) { T val = *ip; ip++; - for(unsigned int r = 0; r < channel_multiplier; r++) + for (unsigned int r = 0; r < channel_multiplier; r++) { op[r] = val; } diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp index 50290757ec..dbf95d23cd 100644 --- a/src/core/NEON/kernels/assembly/winograd.hpp +++ b/src/core/NEON/kernels/assembly/winograd.hpp @@ -45,17 +45,24 @@ struct ConvolutionArgs Shape2D kernel_shape; arm_gemm::Activation activation; - ConvolutionArgs( - unsigned int n_batches, - const Shape2D &input_shape, - unsigned int n_input_channels, - unsigned int pad_top, unsigned int pad_left, - const Shape2D &output_shape, - unsigned int n_output_channels, - const Shape2D kernel_shape, - const arm_gemm::Activation &activation = {}) - : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels), - kernel_shape(kernel_shape), activation(activation) + ConvolutionArgs(unsigned int n_batches, + const Shape2D &input_shape, + unsigned int n_input_channels, + unsigned int pad_top, + unsigned int pad_left, + const Shape2D &output_shape, + unsigned int n_output_channels, + const Shape2D 
kernel_shape, + const arm_gemm::Activation &activation = {}) + : n_batches(n_batches), + input_shape(input_shape), + n_input_channels(n_input_channels), + pad_top(pad_top), + pad_left(pad_left), + output_shape(output_shape), + n_output_channels(n_output_channels), + kernel_shape(kernel_shape), + activation(activation) { } }; @@ -105,23 +112,30 @@ public: virtual unsigned int get_transformed_tile_rows(void) const = 0; virtual unsigned int get_transformed_tile_cols(void) const = 0; - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, const WinogradDomainSpec &wds, - unsigned int thread_id, unsigned int n_threads) const + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + const WinogradDomainSpec &wds, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_row, ld_in_col, ld_input_channel, - outptr, wds.weight_ld_matrix, wds.weight_ld_row, - thread_id, n_threads); + this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix, + wds.weight_ld_row, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, size_t ld_out_matrix, size_t ld_out_row, - unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + size_t ld_out_matrix, + size_t ld_out_row, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace weight_transform @@ -136,27 +150,35 @@ public: virtual unsigned int get_input_rows(void) const = 0; virtual unsigned int get_input_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, const WinogradDomainSpec &wds, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + const WinogradDomainSpec &wds, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_batch, ld_in_row, ld_in_col, - outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row, - working_space, thread_id, n_threads); + this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix, + wds.input_ld_row, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + size_t ld_out_batch, + size_t ld_out_matrix, + size_t ld_out_row, + void *working_space, + 
unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace input_transform @@ -177,31 +199,37 @@ public: virtual unsigned int get_kernel_rows(void) const = 0; virtual unsigned int get_kernel_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, const WinogradDomainSpec &wds, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + const WinogradDomainSpec &wds, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, - inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, - bias, - outptr, ld_out_batch, ld_out_row, ld_out_col, - working_space, thread_id, n_threads); + this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr, + ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_matrix, + size_t ld_in_row, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace output_transform @@ -210,7 +238,7 @@ struct WinogradImpl { const output_transform::ITransform *output_transform = nullptr; const weight_transform::ITransform *weight_transform = nullptr; - const input_transform::ITransform *input_transform = nullptr; + const input_transform::ITransform *input_transform = nullptr; std::unique_ptr gemm_args; WinogradDomainSpec winograd_spec; }; @@ -220,15 +248,18 @@ struct WinogradImpl * Assigns to the pointers in the `dest` struct and returns true or false to * indicate whether the given problem can be executed or not. 
*/ -template -bool get_implementation( - WinogradImpl &dest, // Destination for the selected implementation - const CPUInfo *, - const ConvolutionArgs &, - int max_threads, - bool fast_mode, - const WinogradConfig *, - const arm_gemm::GemmConfig *); +template +bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation + const CPUInfo *, + const ConvolutionArgs &, + int max_threads, + bool fast_mode, + const WinogradConfig *, + const arm_gemm::GemmConfig *); } // namespace winograd } // namespace arm_conv diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp index ed5254a0a4..e3d9b670b3 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include @@ -37,12 +38,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? 
reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f; - - const float16_t denominator = sqrt(input_var[x] + epsilon); - const float16_t numerator = input_ptr[x] - input_mean[x]; - const float16_t x_bar = numerator / denominator; - float16_t res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float16_t beta = (input_beta != nullptr) ? 
input_beta[x] : 0.f; + + const float16_t denominator = sqrt(input_var[x] + epsilon); + const float16_t numerator = input_ptr[x] - input_mean[x]; + const float16_t x_bar = numerator / denominator; + float16_t res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast(output_ptr + x) = res; } - - // Store results - *reinterpret_cast(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization> } -}; -} +static std::map fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization>}}; +} // namespace namespace cpu { -void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp index d6e22e1843..4e1654ee6b 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include @@ -36,12 +37,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? 
wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f; - - const float denominator = sqrt(input_var[x] + epsilon); - const float numerator = input_ptr[x] - input_mean[x]; - const float x_bar = numerator / denominator; - float res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f; + + const float denominator = sqrt(input_var[x] + epsilon); + const float numerator = input_ptr[x] - input_mean[x]; + const float x_bar = numerator / denominator; + float res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast(output_ptr + x) = res; } - - // Store results - *reinterpret_cast(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization> } -}; -} +static std::map fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization>}}; +} // namespace namespace cpu { -void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp index 98cd9aa7fe..48caaa3e63 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include 
"arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -49,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f16(epsilon); const auto const_1 = svdup_n_f16(1.f); const auto const_0 = svdup_n_f16(0.f); const auto va = svdup_n_f16(act_info.a()); const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f16(pg, input_mean + x); - const auto var_vec = svld1_f16(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f16(tmp); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f16(pg, input_mean + x); + const auto var_vec = svld1_f16(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f16(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f16_z(pg, numerator, denominator); - auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f16(tmp); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f16_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f16_z(pg, numerator, denominator); + auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f16_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + } } - } - // Store results - svst1_f16(pg, output_ptr + x, res); + // Store results + svst1_f16(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp index 952ab320bf..df4fbfe607 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -49,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto 
input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f32(epsilon); const auto const_1 = svdup_n_f32(1.f); const auto const_0 = svdup_n_f32(0.f); const auto va = svdup_n_f32(act_info.a()); const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f32(pg, input_mean + x); - const auto var_vec = svld1_f32(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f32(tmp); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f32(pg, input_mean + x); + const auto var_vec = svld1_f32(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f32(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f32_z(pg, numerator, denominator); - auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f32(tmp); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f32_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f32_z(pg, numerator, denominator); + auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f32_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + } } - } - // Store results - svst1_f32(pg, output_ptr + x, res); + // Store results + svst1_f32(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h index 8e0ea36f5a..cbf540bd71 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/list.h +++ b/src/core/NEON/kernels/batchnormalization/impl/list.h @@ -28,9 +28,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ - void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \ - float epsilon, ActivationLayerInfo &act_info, const Window &window) +#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ + void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \ + const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window) DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization); diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h index 3900ea62cd..95cdc8f2f9 100644 --- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h +++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -25,6 +25,7 @@ #define 
ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -158,8 +159,7 @@ struct logistic * * @param[in] act_info Activation layer information. */ - explicit logistic(ActivationLayerInfo act_info) - : vone(wrapper::vdup_n(static_cast(1), ExactTagType{})) + explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast(1), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } @@ -198,8 +198,7 @@ struct relu * * @param[in] act_info Activation layer information. */ - explicit relu(ActivationLayerInfo act_info) - : vzero(wrapper::vdup_n(static_cast(0), ExactTagType{})) + explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast(0), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl index ac196d9dbb..50fff04cad 100644 --- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl +++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/Utils.h" + #include "src/core/NEON/NEMath.h" #include @@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f; constexpr float rgb2u8_green_coef = 0.7152f; constexpr float rgb2u8_blue_coef = 0.0722f; -inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, - const float rcoef, const float gcoef, const float bcoef) +inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, + const float32x4_t &gcolor, + const float32x4_t &bcolor, + const float rcoef, + const float gcoef, + const float bcoef) { float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); @@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); } -inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, - float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec) +inline void rgb_to_yuv_calculation(const float32x4_t &rvec, + const float32x4_t &gvec, + const float32x4_t &bvec, + float32x4_t &yvec, + float32x4_t &uvec, + float32x4_t &vvec) { /* Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' @@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv); } -inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val, - float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha) +inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, + float32x4_t uvec_val, + const float32x4_t &yyvec_val, + float32x4_t vvec_val, + unsigned char *output_ptr, + const bool alpha) { float32x4x3_t rgb1, rgb2; @@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve // b = 1.8556f*f_u + 0.0000f*f_v; const auto red = vmulq_n_f32(vvec_val, red_coef_bt709); const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709); - const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), - vmulq_n_f32(vvec_val, green_coef2_bt709)); + const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709)); // Compute 
the final r,g,b values using y1 for the first texel and y2 for the second one. // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t @@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve uint8x8x3_t u8_rgb; arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb); - if(!alpha) + if (!alpha) { vst3_lane_u8(&output_ptr[0], u8_rgb, 0); vst3_lane_u8(&output_ptr[3], u8_rgb, 4); @@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha) { uint8x16x3_t rgb; - if(alpha) + if (alpha) { const auto tmp = vld4q_u8(ptr); rgb.val[0] = tmp.val[0]; @@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto float32x4x4_t fyvec_top, fuvec_top, fvvec_top; float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], - fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]); - rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], - fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]); + rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i], + fvvec_top.val[i]); + rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i], + fuvec_bottom.val[i], fvvec_bottom.val[i]); } arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); @@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); } -inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_uv) { uint8x16x3_t vec_top, vec_bottom; @@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec vst2_u8(out_uv, uvvec); } -inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) { @@ -273,14 +295,16 @@ inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]); - const auto uvvec = 
vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), - vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); + const auto uvvec = + vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); vst1_u8(out_u, vget_low_u8(uvvec)); vst1_u8(out_v, vget_high_u8(uvvec)); } -inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, +inline void store_rgb_to_yuv4(const uint8x16_t &rvec, + const uint8x16_t &gvec, + const uint8x16_t &bvec, unsigned char *const __restrict out_y, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) @@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); float32x4x4_t fyvec, fuvec, fvvec; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], - fyvec.val[i], fuvec.val[i], fvvec.val[i]); + rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]); } uint8x16_t yvec, uvec, vvec; @@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co vst1q_u8(out_v, vvec); } #endif /* DOXYGEN_SKIP_THIS */ -} +} // namespace namespace arm_compute { @@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16x4_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - ta2.val[3] = vdupq_n_u8(255); - vst4q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16x4_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + ta2.val[3] = vdupq_n_u8(255); + vst4q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGB to U8. @@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16_t ta2; - rgb_to_u8_conversion(ta1, ta2); - vst1q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16_t ta2; + rgb_to_u8_conversion(ta1, ta2); + vst1q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGBX to RGB. @@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld4q_u8(in.ptr()); - uint8x16x3_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - vst3q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld4q_u8(in.ptr()); + uint8x16x3_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + vst3q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert YUYV to RGB. 
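For reference, a minimal scalar sketch (not part of the patch) of the BT.709 chroma-to-RGB arithmetic that yuyv_to_rgb_calculation() above vectorises; the coefficient values are the ones quoted in the comments of that hunk, and the clamping helper is assumed purely for illustration:

#include <algorithm>
#include <cstdint>

// Saturate a float result to the [0, 255] range of an 8-bit channel.
static inline uint8_t clamp_u8(float v)
{
    return static_cast<uint8_t>(std::min(255.f, std::max(0.f, v)));
}

// y is the luma sample; u and v are chroma samples stored with a +128 bias.
static inline void yuv_to_rgb_scalar(float y, float u, float v, uint8_t &r, uint8_t &g, uint8_t &b)
{
    const float fu = u - 128.f; // remove the chroma bias
    const float fv = v - 128.f;
    r = clamp_u8(y + 1.5748f * fv);                // red_coef_bt709
    g = clamp_u8(y - 0.1873f * fu - 0.4681f * fv); // green_coef_bt709 / green_coef2_bt709
    b = clamp_u8(y + 1.8556f * fu);                // blue_coef_bt709
}

The NEON path performs the same per-lane arithmetic on float32x4_t values (after convert_uint8x16_to_float32x4x4) and interleaves the results with vst3/vst4 stores depending on whether an alpha channel is requested.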
@@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta = vld4q_u8(in.ptr()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - // Convert the uint8x16x4_t to float32x4x4_t - const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); - const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); - const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); - const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); - - yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta = vld4q_u8(in.ptr()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + // Convert the uint8x16x4_t to float32x4x4_t + const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); + const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); + const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); + const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); + + yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, + alpha); + }, + in, out); } /** Convert NV12 to RGB. @@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out Iterator in_uv(input_ptr->plane(1), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... 
- - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); - - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_uv, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... 
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_uv, out); } /** Convert IYUV to RGB. @@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out Iterator in_v(input_ptr->plane(2), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto *y_top_ptr = in_y.ptr(); - const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); - const auto *u_ptr = in_u.ptr(); - const auto *v_ptr = in_v.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto *y_top_ptr = in_y.ptr(); + const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); + const auto *u_ptr = in_u.ptr(); + const auto *v_ptr = in_v.ptr(); // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation #if defined(__arch64__) - const auto ta0_y_top = vld1q_u8(y_top_ptr); - const auto ta1_y_top = vld1q_u8(y_top_ptr + 16); - const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); - const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta0_y_top = vld1q_u8(y_top_ptr); + const auto 
ta1_y_top = vld1q_u8(y_top_ptr + 16); + const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); + const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t yyvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #else /* defined(__arch64__) */ - const auto ta_y_top = vld2q_u8(y_top_ptr); - const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u.val[0] = U0 U2 U4 U6 ... - //ta_v.val[0] = V0 V2 V4 V6 ... - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta_y_top = vld2q_u8(y_top_ptr); + const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u.val[0] = U0 U2 U4 U6 ... + //ta_v.val[0] = V0 V2 V4 V6 ... 
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #endif /* defined(__arch64__) */ - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_u, in_v, out); + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_u, in_v, out); } /** Convert YUYV to NV12. @@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... 
- - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16x2_t uvvec; - uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst2q_u8(out_uv.ptr(), uvvec); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16x2_t uvvec; + uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst2q_u8(out_uv.ptr(), uvvec); + }, + in, out_y, out_uv); } /** Convert IYUV to NV12. @@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - uint8x16x2_t ta_uv; - ta_uv.val[0] = vld1q_u8(in_u.ptr()); - ta_uv.val[1] = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst2q_u8(out_uv.ptr(), ta_uv); - }, - in_y, in_u, in_v, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + uint8x16x2_t ta_uv; + ta_uv.val[0] = vld1q_u8(in_u.ptr()); + ta_uv.val[1] = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst2q_u8(out_uv.ptr(), ta_uv); + }, + in_y, in_u, in_v, out_y, out_uv); } /** Convert NV12 to IYUV. @@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... 
- //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); - vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); + vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert YUYV to IYUV. @@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16_t uvec; - uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - vst1q_u8(out_u.ptr(), uvec); - - uint8x16_t vvec; - vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst1q_u8(out_v.ptr(), vvec); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16_t uvec; + uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + vst1q_u8(out_u.ptr(), uvec); + + uint8x16_t vvec; + vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst1q_u8(out_v.ptr(), vvec); + }, + in, out_y, out_u, out_v); } /** Convert NV12 to YUV4. @@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_uv.val[0 + shift]; - uvec.val[1] = ta_uv.val[0 + shift]; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_uv.val[1 - shift]; - vvec.val[1] = ta_uv.val[1 - shift]; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_uv.val[0 + shift]; + uvec.val[1] = ta_uv.val[0 + shift]; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_uv.val[1 - shift]; + vvec.val[1] = ta_uv.val[1 - shift]; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert IYUV to YUV4. @@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_u = vld1q_u8(in_u.ptr()); - const auto ta_v = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u = U0 U2 U4 U6 ... - //ta_v = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_u; - uvec.val[1] = ta_u; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_v; - vvec.val[1] = ta_v; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_u, in_v, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u = U0 U2 U4 U6 ... + //ta_v = V0 V2 V4 V6 ... 
+ + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_u; + uvec.val[1] = ta_u; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_v; + vvec.val[1] = ta_v; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_u, in_v, out_y, out_u, out_v); } /** Convert RGB to NV12. @@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_uv.ptr()); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr()); + }, + in, out_y, out_uv); } /** Convert RGB to IYUV. @@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(), + out_v.ptr()); + }, + in, out_y, out_u, out_v); } /** Convert RGB to YUV4. 
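For reference, a minimal scalar sketch (not part of the patch) of the forward BT.709 RGB-to-YUV computation that rgb_to_yuv_calculation() vectorises and that the colorconvert_rgb_to_yuv4 hunk below applies per pixel. The luma weights are the ones given in that helper's comment; the chroma scale factors are written here as reciprocals of the decode coefficients and should be treated as illustrative assumptions:

#include <cstdint>

struct Yuv
{
    float y;
    float u;
    float v;
};

static inline Yuv rgb_to_yuv_scalar(uint8_t r8, uint8_t g8, uint8_t b8)
{
    const float r = static_cast<float>(r8);
    const float g = static_cast<float>(g8);
    const float b = static_cast<float>(b8);

    Yuv out;
    out.y = 0.2126f * r + 0.7152f * g + 0.0722f * b; // Y' = Kr*R' + Kg*G' + Kb*B'
    out.u = (b - out.y) / 1.8556f + 128.f;           // Cb, re-biased to 128
    out.v = (r - out.y) / 1.5748f + 128.f;           // Cr, re-biased to 128
    return out;
}

The kernel does the same work four lanes at a time on float32x4_t data and converts the resulting Y, U and V planes back to uint8x16_t before storing them.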
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb = load_rgb(in.ptr(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], - out_y.ptr(), out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb = load_rgb(in.ptr(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h index 96defbc9c9..4b1eb079b2 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -33,56 +33,32 @@ namespace detail { inline float32x4x3_t load_matrix_row(const float *ptr) { - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); +float32x4x2_t convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - float32x4x2_t out = - { - { - vmulq_f32(vtop.val[0], m0.val[0]), - vmulq_f32(vtop.val[1], m0.val[0]) - } - }; - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}}; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); out.val[0] = 
vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); @@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); @@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); @@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio { return num_elems_written_per_iteration * 3; } -} +} // namespace detail } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 7ba52a16b7..fd1ee54597 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -45,14 +45,7 @@ namespace detail inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) { ARM_COMPUTE_UNUSED(weights_offset); - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } @@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) * * @return The loaded matrix. */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > +template ::value || std::is_same::value)> inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0) { const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - int32x4x3_t r = - { - { - vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) - } - }; + int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}}; return r; } @@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values * @param[in] input_offset (Optional) Input quantization offset. 
* */ -inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, int input_offset) +inline float32x4_t single_convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + dilation_x), - vld1q_f32(in_top + 2 * dilation_x) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + dilation_x), - vld1q_f32(in_mid + 2 * dilation_x) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + dilation_x), - vld1q_f32(in_low + 2 * dilation_x) - } - }; + const float32x4x3_t vtop = { + {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}}; + const float32x4x3_t vmid = { + {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}}; + const float32x4x3_t vlow = { + {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}}; float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]); out = vmlaq_f32(out, vtop.val[1], m0.val[1]); out = vmlaq_f32(out, vtop.val[2], m0.val[2]); @@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) +inline float32x4x2_t convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + float32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); } @@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_ * */ template -void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset = 0); +void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const 
float32x4x3_t &m2, + unsigned int stridex, + int input_offset = 0); template -inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset) +inline void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f) - } - }; - if(stridex == 2) + float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}}; + if (stridex == 2) { const float32x4x2_t vtop = vld2q_f32(in_top); const float32x4x2_t vmid = vld2q_f32(in_mid); @@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * } else { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); @@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * * @param[in] input_offset Input quantization offset. 
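// Illustrative sketch only, not part of the patch: the load pattern used by
// single_convolve_3x3_dilation above. With dilation the three taps of a row
// are already dilation_x elements apart, so three plain loads replace the
// vext-based sliding window. Names are invented; assumes a NEON toolchain.
#include <arm_neon.h>
#include <cstddef>

static inline float32x4_t conv_row_1x3_dilated(const float *in, const float32x4x3_t &m, size_t dilation_x)
{
    float32x4_t out = vmulq_f32(vld1q_f32(in), m.val[0]);           // tap 0
    out = vmlaq_f32(out, vld1q_f32(in + dilation_x), m.val[1]);     // tap 1
    out = vmlaq_f32(out, vld1q_f32(in + 2 * dilation_x), m.val[2]); // tap 2
    return out;
}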
* */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - size_t dilation_x, int32_t input_offset) +template ::value || std::is_same::value)> +inline int32x4_t single_convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + size_t dilation_x, + int32_t input_offset) { using VectorType = typename std::conditional::value, uint8x8x3_t, int8x8x3_t>::type; using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t; const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + dilation_x), - wrapper::vload(in_top + 2 * dilation_x) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + dilation_x), - wrapper::vload(in_mid + 2 * dilation_x) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + dilation_x), - wrapper::vload(in_low + 2 * dilation_x) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), - } - }; + const VectorType vtop = { + {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}}; + const VectorType vmid = { + {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}}; + const VectorType vlow = { + {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), + }}; + const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, 
wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), + }}; int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]); out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]); @@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, * @param[in] input_offset Input quantization offset. * */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset) +template ::value || std::is_same::value)> +inline int32x4x2_t convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); - int32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + int32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); } @@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const * @param[in] input_offset Input quantization offset. 
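// Illustrative sketch only, not part of the patch: what one element of the
// wrapper::vaddw(vreinterpret(vgetlow(vmovl(...)))) chain above boils down to
// in plain NEON for the uint8_t case. Names are invented; assumes NEON.
#include <arm_neon.h>
#include <cstdint>

static inline int32x4_t widen_u8_add_offset(const uint8_t *ptr, int32_t input_offset)
{
    const uint8x8_t  v8  = vld1_u8(ptr);                            // 8 x u8 (low 4 used)
    const uint16x8_t v16 = vmovl_u8(v8);                            // widen to u16
    const int16x4_t  lo  = vreinterpret_s16_u16(vget_low_u16(v16)); // low half as s16
    return vaddw_s16(vdupq_n_s32(input_offset), lo);                // offset + x, in s32
}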
* */ -template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - unsigned int stridex, int32_t input_offset) +template ::value || std::is_same::value)> +void convolve_3x3(const T1 *in_top, + const T1 *in_mid, + const T1 *in_low, + T2 *out_ptr, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + unsigned int stridex, + int32_t input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); using VectorType = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; @@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + 8) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + 8) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + 8) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - } - }; - - int32x4x2_t out - { - { - wrapper::vdup_n(static_cast(0), OutputTagType{}), - wrapper::vdup_n(static_cast(0), OutputTagType{}), - } - }; + const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}}; + const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}}; + const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + }}; + const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, 
wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + }}; + + int32x4x2_t out{{ + wrapper::vdup_n(static_cast(0), OutputTagType{}), + wrapper::vdup_n(static_cast(0), OutputTagType{}), + }}; // 0 out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]); @@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]); out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]); - if(stridex == 1) + if (stridex == 1) { accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); } - else if(stridex == 2) + else if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); @@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = ARM_COMPUTE_UNUSED(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const float16x8x3_t r = - { - { - vld1q_dup_f16(ptr), - vld1q_dup_f16(1 + ptr), - vld1q_dup_f16(2 + ptr) - } - }; + const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}}; return r; } @@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = * @param[in] input_offset (Optional)Input quantization offset. 
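// Illustrative sketch only, not part of the patch: an enable_if spelling of the
// kind of constraint ARM_COMPUTE_REQUIRES_TA applies to the quantized overloads
// above. The macro's real definition is not shown in this diff, and the element
// types (uint8_t / int8_t) are an assumption based on the surrounding code.
#include <cstdint>
#include <type_traits>

template <typename T,
          typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int>::type = 0>
inline int32_t as_s32(T value)
{
    return static_cast<int32_t>(value); // only instantiable for uint8_t / int8_t
}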
* */ -inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, int input_offset = 0) +inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + dilation_x), - vld1q_f16(in_top + 2 * dilation_x) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + dilation_x), - vld1q_f16(in_mid + 2 * dilation_x) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + dilation_x), - vld1q_f16(in_low + 2 * dilation_x) - } - }; + const float16x8x3_t vtop = { + {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}}; + const float16x8x3_t vmid = { + {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}}; + const float16x8x3_t vlow = { + {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}}; float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]); out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1])); out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2])); @@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) -{ - float16x8x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset) - } - }; - - if(stridex == 2) +inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) +{ + float16x8x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}}; + + if (stridex == 2) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); @@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 * */ template -inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t 
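// Illustrative sketch only, not part of the patch: the multiply-then-add
// pattern the fp16 path above uses (vaddq_f16 of a vmulq_f16) rather than a
// fused multiply-accumulate. Assumes a toolchain exposing
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, as the surrounding file does.
#include <arm_neon.h>

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
static inline float16x8_t madd_f16(float16x8_t acc, float16x8_t a, float16x8_t b)
{
    return vaddq_f16(acc, vmulq_f16(a, b)); // acc + a * b over eight half-precision lanes
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC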
*in_low, float16_t *out_ptr, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - unsigned int stridex, int input_offset = 0) +inline void convolve_3x3(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + float16_t *out_ptr, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - float16x8x2_t out = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - if(stridex == 2) + float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}}; + if (stridex == 2) { const float16x8x2_t vtop = vld2q_f16(in_top); const float16x8x2_t vmid = vld2q_f16(in_mid); @@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const } else { - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + 8), - vld1q_f16(in_top + 16) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + 8), - vld1q_f16(in_mid + 16) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + 8), - vld1q_f16(in_low + 16) - } - }; - out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); + const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}}; + const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}}; + const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}}; + out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1])); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2])); @@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1])); out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2])); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const */ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex) { - switch(stridex) + switch (stridex) { case 1: return num_elems_written_per_iteration; @@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter return 0; } } -} +} // namespace detail } // namespace arm_compute #endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */ diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h index 1c77a9e9f0..381de2284a 100644 --- a/src/core/NEON/wrapper/intrinsics/cvt.h +++ b/src/core/NEON/wrapper/intrinsics/cvt.h @@ -30,12 +30,11 @@ namespace arm_compute { namespace wrapper { -#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template \ - inline typename std::enable_if::value, float32x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template \ + inline typename std::enable_if::value, float32x4_t>::type vcvt(const vtype &a) \ + { \ 
+ return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32) @@ -46,12 +45,11 @@ VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16) #undef VCVT_TO_F32_IMPL #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template \ - inline typename std::enable_if::value, float16x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template \ + inline typename std::enable_if::value, float16x4_t>::type vcvt(const vtype &a) \ + { \ + return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) @@ -59,14 +57,14 @@ VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -inline typename std::enable_if < std::is_same::value || std::is_same::value, uint32x4_t >::type +inline typename std::enable_if::value || std::is_same::value, uint32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_u32_f32(a); } template -inline typename std::enable_if < std::is_same::value || std::is_same::value, int32x4_t >::type +inline typename std::enable_if::value || std::is_same::value, int32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_s32_f32(a); @@ -74,15 +72,13 @@ vcvt(const float32x4_t &a) #ifdef __aarch64__ template -inline typename std::enable_if::value, uint32x4_t>::type -vcvta(const float32x4_t &a) +inline typename std::enable_if::value, uint32x4_t>::type vcvta(const float32x4_t &a) { return vcvtaq_u32_f32(a); } template -inline typename std::enable_if::value, int32x4_t>::type -vcvta(const float32x4_t &a) +inline typename std::enable_if::value, int32x4_t>::type vcvta(const float32x4_t &a) { return vcvtaq_s32_f32(a); } @@ -96,14 +92,13 @@ vcvta(const float32x4_t &a) */ inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr) { - __asm __volatile( - "ldp q0, q1, [%[inptr]]\n" - ".inst 0xea16800\n" // BFCVTN v0, v0 - ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 - "str q0, [%[outptr]]\n" - : [inptr] "+r"(inptr) - : [outptr] "r"(outptr) - : "v0", "v1", "memory"); + __asm __volatile("ldp q0, q1, [%[inptr]]\n" + ".inst 0xea16800\n" // BFCVTN v0, v0 + ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 + "str q0, [%[outptr]]\n" + : [inptr] "+r"(inptr) + : [outptr] "r"(outptr) + : "v0", "v1", "memory"); } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h index 265f30d33b..ece991a5b0 100644 --- a/src/core/NEON/wrapper/intrinsics/div.h +++ b/src/core/NEON/wrapper/intrinsics/div.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_DIV_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h index e2207648e5..0e34462b96 100644 --- a/src/core/NEON/wrapper/intrinsics/erf.h +++ b/src/core/NEON/wrapper/intrinsics/erf.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_WRAPPER_ERF_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h index c2a6970967..f44577b926 100644 --- a/src/core/NEON/wrapper/intrinsics/exp.h +++ b/src/core/NEON/wrapper/intrinsics/exp.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_EXP_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h 
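// Illustrative sketch only, not part of the patch: the behavioural difference
// between the two conversions wrapped above. vcvtq_s32_f32 truncates toward
// zero, vcvtaq_s32_f32 rounds to nearest with ties away from zero (AArch64).
// Names are invented; assumes a NEON toolchain.
#include <arm_neon.h>
#include <cstdint>

#ifdef __aarch64__
static inline void cvt_demo(int32_t *trunc_out, int32_t *round_out)
{
    const float       vals[4] = {1.5f, -1.5f, 2.7f, -2.7f};
    const float32x4_t x       = vld1q_f32(vals);
    vst1q_s32(trunc_out, vcvtq_s32_f32(x));  // { 1, -1, 2, -2 }
    vst1q_s32(round_out, vcvtaq_s32_f32(x)); // { 2, -2, 3, -3 }
}
#endif // __aarch64__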
b/src/core/NEON/wrapper/intrinsics/getlane.h index 2052751612..ae813bb2fa 100644 --- a/src/core/NEON/wrapper/intrinsics/getlane.h +++ b/src/core/NEON/wrapper/intrinsics/getlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VGETLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VGETLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VGETLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -102,7 +102,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_16(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -144,7 +144,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -170,7 +170,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -188,7 +188,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h index de398b0403..e443be679b 100644 --- a/src/core/NEON/wrapper/intrinsics/inv.h +++ b/src/core/NEON/wrapper/intrinsics/inv.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INV_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h index 2343efa8f8..257b445cc7 100644 --- a/src/core/NEON/wrapper/intrinsics/invsqrt.h +++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INVSQRT_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h index 357a77ca78..d091407edb 100644 --- a/src/core/NEON/wrapper/intrinsics/log.h +++ b/src/core/NEON/wrapper/intrinsics/log.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_LOG_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h index 61f834ed23..dfd6ccc358 100644 --- a/src/core/NEON/wrapper/intrinsics/pow.h +++ b/src/core/NEON/wrapper/intrinsics/pow.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_POW_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h index 167f3cf43b..9a0a23a241 100644 --- 
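// Illustrative sketch only, not part of the patch: why the vgetlane macros
// above expand to a switch. The lane argument of vgetq_lane_f32 must be a
// compile-time constant, so a runtime index is dispatched case by case.
#include <arm_neon.h>

static inline float getlane_runtime(float32x4_t v, unsigned int lane)
{
    switch (lane)
    {
        case 0:
            return vgetq_lane_f32(v, 0);
        case 1:
            return vgetq_lane_f32(v, 1);
        case 2:
            return vgetq_lane_f32(v, 2);
        default:
            return vgetq_lane_f32(v, 3);
    }
}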
a/src/core/NEON/wrapper/intrinsics/qmov.h +++ b/src/core/NEON/wrapper/intrinsics/qmov.h @@ -31,15 +31,13 @@ namespace arm_compute namespace wrapper { template -inline typename std::enable_if::value, uint8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if::value, uint8x8_t>::type vqmov(const int16x8_t &a) { return vqmovun_s16(a); } template -inline typename std::enable_if::value, int8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if::value, int8x8_t>::type vqmov(const int16x8_t &a) { return vqmovn_s16(a); } diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h index cf00a4aceb..c2c4f720d2 100644 --- a/src/core/NEON/wrapper/intrinsics/reinterpret.h +++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h @@ -35,7 +35,7 @@ namespace wrapper { \ return prefix##_##postfix1##_##postfix2(a); \ } \ - \ + \ inline ptype vreinterpret(const ptype &a) \ { \ return a; \ diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h index d23feb6b42..7789aab770 100644 --- a/src/core/NEON/wrapper/intrinsics/round.h +++ b/src/core/NEON/wrapper/intrinsics/round.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_ROUND_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h index 197eedacb5..259b8eaf90 100644 --- a/src/core/NEON/wrapper/intrinsics/setlane.h +++ b/src/core/NEON/wrapper/intrinsics/setlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -102,7 +102,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -144,7 +144,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -170,7 +170,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h index 73ca9c56c6..6ccb9cdf92 100644 --- a/src/core/NEON/wrapper/intrinsics/shr.h +++ 
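// Illustrative sketch only, not part of the patch: the two saturating narrows
// selected by the vqmov wrappers above. vqmovun_s16 clamps to [0, 255],
// vqmovn_s16 clamps to [-128, 127]. Pointer names are invented.
#include <arm_neon.h>
#include <cstdint>

static inline void qmov_demo(const int16_t *in, uint8_t *out_u8, int8_t *out_s8)
{
    const int16x8_t v = vld1q_s16(in);
    vst1_u8(out_u8, vqmovun_s16(v)); // e.g. -5 -> 0, 300 -> 255
    vst1_s8(out_s8, vqmovn_s16(v));  // e.g. -200 -> -128, 300 -> 127
}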
b/src/core/NEON/wrapper/intrinsics/shr.h @@ -75,7 +75,7 @@ VQRSHRN_SCALAR_IMPL(uint32_t, uint64_t, vqrshrnd_n, u64) { \ return prefix_signed##_##postfix(a, b); \ } \ - \ + \ template \ inline typename std::enable_if::value && !std::is_signed::value, u##half_vtype>::type \ vqrshrn_ex(const vtype &a) \ @@ -128,7 +128,7 @@ VSHRQ_SCALAR_IMPL(int32_t, vshrd_n, s64) { \ return prefix_signed##_##postfix(a, b); \ } \ - \ + \ template \ inline typename std::enable_if::value && !std::is_signed::value, u##half_vtype>::type \ vqrshrn_ex(const vtype &a) \ diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h index 03c2813a32..d24fdfa816 100644 --- a/src/core/NEON/wrapper/intrinsics/sin.h +++ b/src/core/NEON/wrapper/intrinsics/sin.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_SIN_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute @@ -54,4 +55,4 @@ VSIN_IMPL_INT(int32x4_t, vsinq, s32) #undef vsub_IMPL } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h index e530e7c83f..c4652504b4 100644 --- a/src/core/NEON/wrapper/intrinsics/svcnt.h +++ b/src/core/NEON/wrapper/intrinsics/svcnt.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template -inline uint64_t svcnt_size(); +inline uint64_t svcnt_size(); template <> inline uint64_t svcnt_size<64>() @@ -65,4 +65,4 @@ inline uint64_t svcnt() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h index 746b004d7d..00ef7b7eb3 100644 --- a/src/core/NEON/wrapper/intrinsics/svcvt.h +++ b/src/core/NEON/wrapper/intrinsics/svcvt.h @@ -29,11 +29,12 @@ namespace arm_compute { namespace wrapper { -#define SVCVT_Z_TO_F32_IMPL(vtype) \ - template \ - inline typename std::enable_if::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f32_z(pg, a); \ +#define SVCVT_Z_TO_F32_IMPL(vtype) \ + template \ + inline typename std::enable_if::value, svfloat32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f32_z(pg, a); \ } SVCVT_Z_TO_F32_IMPL(svuint32_t) @@ -42,11 +43,12 @@ SVCVT_Z_TO_F32_IMPL(svfloat16_t) #undef SVCVT_Z_TO_F32_IMPL -#define SVCVT_Z_TO_F16_IMPL(vtype) \ - template \ - inline typename std::enable_if::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f16_z(pg, a); \ +#define SVCVT_Z_TO_F16_IMPL(vtype) \ + template \ + inline typename std::enable_if::value, svfloat16_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f16_z(pg, a); \ } SVCVT_Z_TO_F16_IMPL(svuint32_t) @@ -55,11 +57,12 @@ SVCVT_Z_TO_F16_IMPL(svfloat32_t) #undef SVCVT_Z_TO_F16_IMPL -#define SVCVT_Z_TO_S32_IMPL(vtype) \ - template \ - inline typename std::enable_if::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_s32_z(pg, a); \ +#define SVCVT_Z_TO_S32_IMPL(vtype) \ + template \ + inline typename std::enable_if::value, svint32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_s32_z(pg, a); \ } SVCVT_Z_TO_S32_IMPL(svfloat16_t) @@ -71,4 +74,4 @@ SVCVT_Z_TO_S32_IMPL(svfloat32_t) } // namespace arm_compute #endif /* 
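// Illustrative sketch only, not part of the patch: what the svcnt_size
// specialisations above resolve to, i.e. the number of lanes an SVE vector
// holds for a given element width. Assumes an SVE-enabled toolchain.
#include <cstdint>
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

static inline uint64_t sve_lanes_32bit()
{
    return svcntw(); // active 32-bit lanes per vector
}

static inline uint64_t sve_lanes_64bit()
{
    return svcntd(); // active 64-bit lanes per vector
}
#endif // __ARM_FEATURE_SVE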
defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h index d6ce9a77d1..1e8bce3960 100644 --- a/src/core/NEON/wrapper/intrinsics/svexp.h +++ b/src/core/NEON/wrapper/intrinsics/svexp.h @@ -26,6 +26,7 @@ #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -46,4 +47,4 @@ SVEXP_IMPL(svfloat16_t, f16) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h index 5b505ae1e3..b4630e20ed 100644 --- a/src/core/NEON/wrapper/intrinsics/svlog.h +++ b/src/core/NEON/wrapper/intrinsics/svlog.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -44,4 +45,4 @@ SVLOG_IMPL(svfloat16_t, f16) } // namespace wrapper } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h index 53407e5301..6ed00bccbf 100644 --- a/src/core/NEON/wrapper/intrinsics/svptrue.h +++ b/src/core/NEON/wrapper/intrinsics/svptrue.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template -inline svbool_t svptrue_size(); +inline svbool_t svptrue_size(); template <> inline svbool_t svptrue_size<64>() @@ -65,4 +65,4 @@ svbool_t svptrue() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h index ef58217dc4..f0f84a9508 100644 --- a/src/core/NEON/wrapper/intrinsics/svwhilelt.h +++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h @@ -32,7 +32,7 @@ namespace wrapper #define SVWHILELT_IMPL(type) \ template \ inline svbool_t svwhilelt_size(type a, type b); \ - \ + \ template <> \ inline svbool_t svwhilelt_size<64>(type a, type b) \ { \ @@ -70,4 +70,4 @@ inline svbool_t svwhilelt(IndexType a, IndexType b) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ \ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h index daeaf19997..e74f0e86fe 100644 --- a/src/core/NEON/wrapper/intrinsics/tanh.h +++ b/src/core/NEON/wrapper/intrinsics/tanh.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_TANH_H #include "src/core/NEON/NEMath.h" + #include namespace arm_compute diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h index 642d9261f3..2ec88869e3 100644 --- a/src/core/NEON/wrapper/scalar/add.h +++ b/src/core/NEON/wrapper/scalar/add.h @@ -32,22 +32,22 @@ namespace wrapper { inline uint8_t add_sat(const uint8_t &a, const uint8_t &b) { - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 
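// Illustrative sketch only, not part of the patch: the predicated-loop idiom
// the svptrue/svwhilelt wrappers above support, processing an arbitrary-length
// float buffer without a scalar tail. Function and variable names are
// invented; assumes an SVE-enabled toolchain.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

static inline void scale_in_place(float *data, int64_t n, float factor)
{
    for (int64_t i = 0; i < n; i += svcntw())
    {
        const svbool_t    pg = svwhilelt_b32(i, n);             // lanes still in range
        const svfloat32_t v  = svld1_f32(pg, data + i);         // predicated load
        svst1_f32(pg, data + i, svmul_n_f32_x(pg, v, factor));  // scale and store
    }
}
#endif // __ARM_FEATURE_SVE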
0, 0, 0, 0, 0, 0, 0 }; + const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0}; + const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0}; return vget_lane_u8(vqadd_u8(va, vb), 0); } inline int16_t add_sat(const int16_t &a, const int16_t &b) { - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; + const int16x4_t va = {a, 0, 0, 0}; + const int16x4_t vb = {b, 0, 0, 0}; return vget_lane_s16(vqadd_s16(va, vb), 0); } inline int32_t add_sat(const int32_t &a, const int32_t &b) { - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; + const int32x2_t va = {a, 0}; + const int32x2_t vb = {b, 0}; return vget_lane_s32(vqadd_s32(va, vb), 0); } diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h index 1fe51d75fc..00de7d867f 100644 --- a/src/core/NEON/wrapper/scalar/sub.h +++ b/src/core/NEON/wrapper/scalar/sub.h @@ -32,22 +32,22 @@ namespace wrapper { inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b) { - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; + const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0}; + const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0}; return vget_lane_u8(vqsub_u8(va, vb), 0); } inline int16_t sub_sat(const int16_t &a, const int16_t &b) { - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; + const int16x4_t va = {a, 0, 0, 0}; + const int16x4_t vb = {b, 0, 0, 0}; return vget_lane_s16(vqsub_s16(va, vb), 0); } inline int32_t sub_sat(const int32_t &a, const int32_t &b) { - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; + const int32x2_t va = {a, 0}; + const int32x2_t vb = {b, 0}; return vget_lane_s32(vqsub_s32(va, vb), 0); } diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h index 5ccd0ba8f1..330d272752 100644 --- a/src/core/NEON/wrapper/svtraits.h +++ b/src/core/NEON/wrapper/svtraits.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H #if defined(ARM_COMPUTE_ENABLE_SVE) #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp index 99858e2a98..62ce335815 100644 --- a/src/core/Rounding.cpp +++ b/src/core/Rounding.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Rounding.h" #include "arm_compute/core/Error.h" + #include "support/ToolchainSupport.h" #include @@ -36,7 +37,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy) { using namespace std; int rounded = 0; - switch(rounding_policy) + switch (rounding_policy) { case RoundingPolicy::TO_ZERO: { @@ -51,9 +52,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy) case RoundingPolicy::TO_NEAREST_EVEN: { #ifdef __aarch64__ - asm("fcvtns %x[res], %s[value]" - : [res] "=r"(rounded) - : [value] "w"(x)); + asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x)); #else // __aarch64__ ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported."); #endif // __aarch64__ diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp index 6eb46e56af..69b2651520 100644 --- a/src/core/Size2D.cpp +++ b/src/core/Size2D.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
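// Illustrative sketch only, not part of the patch: the lane-0 trick used by the
// scalar add_sat/sub_sat helpers above. The scalars are placed in a vector, the
// saturating vector op does the clamping, and lane 0 is read back. vdup_n_u8
// stands in for the {a, 0, ...} initializer, since only lane 0 matters.
#include <arm_neon.h>
#include <cstdint>

static inline uint8_t add_sat_u8_demo(uint8_t a, uint8_t b)
{
    const uint8x8_t va = vdup_n_u8(a);
    const uint8x8_t vb = vdup_n_u8(b);
    return vget_lane_u8(vqadd_u8(va, vb), 0); // e.g. 200 + 100 -> 255
}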
*/ #include "arm_compute/core/Size2D.h" + #include "support/StringSupport.h" namespace arm_compute @@ -30,4 +31,4 @@ std::string Size2D::to_string() const { return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height); } -} +} // namespace arm_compute diff --git a/src/core/Size3D.cpp b/src/core/Size3D.cpp index 3ee9fb8e5c..b56a99acd7 100644 --- a/src/core/Size3D.cpp +++ b/src/core/Size3D.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "arm_compute/core/Size3D.h" + #include "support/StringSupport.h" namespace arm_compute { std::string Size3D::to_string() const { - return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + support::cpp11::to_string(depth); + return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + + support::cpp11::to_string(depth); } -} \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp index 723b6bc016..8012c3d721 100644 --- a/src/core/SubTensorInfo.cpp +++ b/src/core/SubTensorInfo.cpp @@ -42,10 +42,10 @@ namespace TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords) { // Extend shape - for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) + for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) { int dimension_extend = coords[i] + static_cast(shape[i]); - if((dimension_extend > static_cast(parent_shape[i])) && (dimension_extend > 0)) + if ((dimension_extend > static_cast(parent_shape[i])) && (dimension_extend > 0)) { parent_shape.set(i, static_cast(dimension_extend)); } @@ -56,23 +56,35 @@ TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coo } // namespace SubTensorInfo::SubTensorInfo() - : _parent(nullptr), _tensor_shape(), _dims_state(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false), _lock_paddings(false) + : _parent(nullptr), + _tensor_shape(), + _dims_state(), + _coords(), + _valid_region{Coordinates(), _tensor_shape}, + _extend_parent(false), + _lock_paddings(false) { } SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent) - : _parent(parent), _tensor_shape(tensor_shape), _dims_state(), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent), _lock_paddings(false) + : _parent(parent), + _tensor_shape(tensor_shape), + _dims_state(), + _coords(coords), + _valid_region{Coordinates(), _tensor_shape}, + _extend_parent(extend_parent), + _lock_paddings(false) { ARM_COMPUTE_ERROR_ON(parent == nullptr); // Check if subtensor is valid if parent is configured - if(parent->tensor_shape().total_size() != 0 && !_extend_parent) + if (parent->tensor_shape().total_size() != 0 && !_extend_parent) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape); } // Initialize valid region - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; } std::unique_ptr SubTensorInfo::clone() const @@ -91,17 +103,17 @@ ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape) ARM_COMPUTE_ERROR_ON(_parent == nullptr); // Check if subtensor is valid if parent is configured - if(_parent->tensor_shape().total_size() != 0 && !_extend_parent) + if (_parent->tensor_shape().total_size() != 0 && !_extend_parent) { 
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape); - _valid_region = ValidRegion{ _coords, shape }; + _valid_region = ValidRegion{_coords, shape}; } - else if(_extend_parent) // Extend parent shape, configure if specified + else if (_extend_parent) // Extend parent shape, configure if specified { ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN)); TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords); _parent->set_tensor_shape(parent_extended_shape); - _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape }); + _parent->set_valid_region(ValidRegion{Coordinates(), parent_extended_shape}); } _tensor_shape = shape; return *this; @@ -133,11 +145,11 @@ bool SubTensorInfo::extend_padding(const PaddingSize &padding) ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0); // Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor - if(!_extend_parent && (padding.left || padding.right)) + if (!_extend_parent && (padding.left || padding.right)) { ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x()); } - if(!_extend_parent && (padding.top || padding.bottom)) + if (!_extend_parent && (padding.top || padding.bottom)) { ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y()); } @@ -153,7 +165,7 @@ int32_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const int32_t offset = offset_first_element_in_bytes(); const Strides &strides = strides_in_bytes(); - for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) + for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) { offset += pos[i] * strides[i]; } diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp index 5905ba5215..31bddbde40 100644 --- a/src/core/TensorInfo.cpp +++ b/src/core/TensorInfo.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/Utils.h" #include @@ -34,13 +35,26 @@ namespace arm_compute { TensorInfo::TensorInfo() - : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true }, - _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id), _lock_paddings(false) -{ -} - -TensorInfo::TensorInfo(const ITensorInfo &info) - : TensorInfo() + : _total_size(0), + _offset_first_element_in_bytes(0), + _strides_in_bytes(), + _num_channels(0), + _tensor_shape(), + _dims_state(), + _data_type(DataType::UNKNOWN), + _format(Format::UNKNOWN), + _is_resizable{true}, + _valid_region{Coordinates(), _tensor_shape}, + _padding{0}, + _quantization_info(), + _data_layout(DataLayout::NCHW), + _are_values_constant(true), + _id(invalid_tensor_id), + _lock_paddings(false) +{ +} + +TensorInfo::TensorInfo(const ITensorInfo &info) : TensorInfo() { _total_size = info.total_size(); _offset_first_element_in_bytes = info.offset_first_element_in_bytes(); @@ -60,8 +74,7 @@ TensorInfo::TensorInfo(const ITensorInfo &info) _lock_paddings = info.lock_paddings(); } -TensorInfo::TensorInfo(const TensorInfo &info) - : TensorInfo() +TensorInfo::TensorInfo(const TensorInfo &info) : TensorInfo() { _total_size = info.total_size(); _offset_first_element_in_bytes = 
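// Illustrative sketch only, not part of the patch: the byte-offset rule in
// offset_element_in_bytes above, i.e. offset = offset_of_first_element +
// sum_i pos[i] * stride_in_bytes[i]. The fixed three dimensions and the
// example numbers are invented for the illustration.
#include <cstddef>
#include <cstdint>

static inline int32_t element_offset_bytes(const int32_t (&pos)[3], const int32_t (&strides)[3], int32_t first)
{
    int32_t offset = first;
    for (size_t i = 0; i < 3; ++i)
    {
        offset += pos[i] * strides[i];
    }
    return offset; // e.g. pos {1, 2, 0}, strides {4, 64, 1024}, first 0 -> 132
}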
info.offset_first_element_in_bytes(); @@ -80,8 +93,7 @@ TensorInfo::TensorInfo(const TensorInfo &info) _id = info.id(); _lock_paddings = false; } -TensorInfo::TensorInfo(Format format) - : TensorInfo(TensorShape(), format) +TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format) { } @@ -90,25 +102,25 @@ TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format format) { } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) - : TensorInfo() +TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) : TensorInfo() { init(tensor_shape, format); } -TensorInfo::TensorInfo(size_t num_channels, DataType data_type) - : TensorInfo() +TensorInfo::TensorInfo(size_t num_channels, DataType data_type) : TensorInfo() { init(TensorShape(), num_channels, data_type); } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) - : TensorInfo() +TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) : TensorInfo() { init(tensor_shape, num_channels, data_type); } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info) +TensorInfo::TensorInfo(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + QuantizationInfo quantization_info) : TensorInfo() { init(tensor_shape, num_channels, data_type); @@ -137,9 +149,11 @@ void TensorInfo::init(const TensorShape &tensor_shape, Format format) _format = format; } -void TensorInfo::init(const TensorShape &tensor_shape, Format format, - const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes) +void TensorInfo::init(const TensorShape &tensor_shape, + Format format, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes) { size_t num_channels = num_channels_from_format(format); const DataType type = data_type_from_format(format); @@ -165,9 +179,12 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data set_tensor_shape(tensor_shape); } -void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, - const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes) +void TensorInfo::init(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes) { ARM_COMPUTE_ERROR_ON(num_channels == 0); @@ -179,7 +196,7 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data _strides_in_bytes = strides_in_bytes; _total_size = total_size_in_bytes; - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; } size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format) @@ -202,7 +219,7 @@ size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num _format = Format::UNKNOWN; _tensor_shape = tensor_shape; - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; auto_padding(); @@ -233,11 +250,11 @@ std::tuple TensorInfo::calculate_padding_requirements(c size_t required_total_size = 0; const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y; - switch(_tensor_shape.num_dimensions()) + switch 
(_tensor_shape.num_dimensions()) { case 0: { - if(_tensor_shape.total_size() > 0) + if (_tensor_shape.total_size() > 0) { required_strides = Strides(stride_x, stride_x); required_total_size = stride_z; @@ -258,7 +275,8 @@ std::tuple TensorInfo::calculate_padding_requirements(c const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - required_total_size = static_cast(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension]; + required_total_size = + static_cast(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension]; break; } } @@ -284,25 +302,25 @@ bool TensorInfo::extend_padding(const PaddingSize &padding) bool updated = false; - if(padding.top > _padding.top) + if (padding.top > _padding.top) { _padding.top = padding.top; updated = true; } - if(padding.right > _padding.right) + if (padding.right > _padding.right) { _padding.right = padding.right; updated = true; } - if(padding.bottom > _padding.bottom) + if (padding.bottom > _padding.bottom) { _padding.bottom = padding.bottom; updated = true; } - if(padding.left > _padding.left) + if (padding.left > _padding.left) { _padding.left = padding.left; updated = true; @@ -336,7 +354,7 @@ ITensorInfo &TensorInfo::set_format(Format format) { _format = format; - if(_data_type == DataType::UNKNOWN) + if (_data_type == DataType::UNKNOWN) { _num_channels = num_channels_from_format(format); _data_type = data_type_from_format(format); @@ -355,19 +373,19 @@ ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape) _offset_first_element_in_bytes = 0; _strides_in_bytes = compute_strides(*this); - if(_tensor_shape.num_dimensions() == 0) + if (_tensor_shape.num_dimensions() == 0) { _total_size = _strides_in_bytes[0]; } else { const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - _total_size = static_cast(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension]; + _total_size = static_cast(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension]; } std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; return *this; } @@ -392,9 +410,10 @@ ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout) ITensorInfo &TensorInfo::reset_padding() { _padding = PaddingSize(); - if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0) + if (((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0) { - std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); + std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = + calculate_padding_requirements(_padding); } return *this; } @@ -405,7 +424,7 @@ int32_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const int32_t offset = _offset_first_element_in_bytes; - for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) + for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) { offset += pos[i] * _strides_in_bytes[i]; } diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 1ca7adb3a8..90a7ac32c0 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -49,7 +49,7 @@ std::string read_file(const std::string &filename, bool binary) fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); std::ios_base::openmode mode = std::ios::in; - 
if(binary) + if (binary) { mode |= std::ios::binary; } @@ -66,7 +66,7 @@ std::string read_file(const std::string &filename, bool binary) out.assign(std::istreambuf_iterator(fs), std::istreambuf_iterator()); #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what()); } @@ -77,32 +77,28 @@ std::string read_file(const std::string &filename, bool binary) const std::string &string_from_channel(Channel channel) { - static std::map channels_map = - { - { Channel::UNKNOWN, "UNKNOWN" }, - { Channel::R, "R" }, - { Channel::G, "G" }, - { Channel::B, "B" }, - { Channel::A, "A" }, - { Channel::Y, "Y" }, - { Channel::U, "U" }, - { Channel::V, "V" }, - { Channel::C0, "C0" }, - { Channel::C1, "C1" }, - { Channel::C2, "C2" }, - { Channel::C3, "C3" } - }; + static std::map channels_map = {{Channel::UNKNOWN, "UNKNOWN"}, + {Channel::R, "R"}, + {Channel::G, "G"}, + {Channel::B, "B"}, + {Channel::A, "A"}, + {Channel::Y, "Y"}, + {Channel::U, "U"}, + {Channel::V, "V"}, + {Channel::C0, "C0"}, + {Channel::C1, "C1"}, + {Channel::C2, "C2"}, + {Channel::C3, "C3"}}; return channels_map[channel]; } const std::string &string_from_border_mode(BorderMode border_mode) { - static std::map border_mode_map = - { - { BorderMode::UNDEFINED, "UNDEFINED" }, - { BorderMode::CONSTANT, "CONSTANT" }, - { BorderMode::REPLICATE, "REPLICATE" }, + static std::map border_mode_map = { + {BorderMode::UNDEFINED, "UNDEFINED"}, + {BorderMode::CONSTANT, "CONSTANT"}, + {BorderMode::REPLICATE, "REPLICATE"}, }; return border_mode_map[border_mode]; @@ -110,11 +106,10 @@ const std::string &string_from_border_mode(BorderMode border_mode) const std::string &string_from_norm_type(NormType type) { - static std::map norm_type_map = - { - { NormType::IN_MAP_1D, "IN_MAP_1D" }, - { NormType::IN_MAP_2D, "IN_MAP_2D" }, - { NormType::CROSS_MAP, "CROSS_MAP" }, + static std::map norm_type_map = { + {NormType::IN_MAP_1D, "IN_MAP_1D"}, + {NormType::IN_MAP_2D, "IN_MAP_2D"}, + {NormType::CROSS_MAP, "CROSS_MAP"}, }; return norm_type_map[type]; @@ -122,11 +117,10 @@ const std::string &string_from_norm_type(NormType type) const std::string &string_from_pooling_type(PoolingType type) { - static std::map pool_type_map = - { - { PoolingType::MAX, "MAX" }, - { PoolingType::AVG, "AVG" }, - { PoolingType::L2, "L2" }, + static std::map pool_type_map = { + {PoolingType::MAX, "MAX"}, + {PoolingType::AVG, "AVG"}, + {PoolingType::L2, "L2"}, }; return pool_type_map[type]; @@ -134,38 +128,36 @@ const std::string &string_from_pooling_type(PoolingType type) bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info) { - if(info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0) + if (info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0) { return false; } const auto ps = info.pad_stride_info; - const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.pad_left(), ps.pad_right() }); - const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.pad_top(), ps.pad_bottom() }); + const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.pad_left(), ps.pad_right()}); + const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.pad_top(), ps.pad_bottom()}); return pool_le_padding_x || pool_le_padding_y; } bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info) { - if(info.is_global_pooling || 
info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0) + if (info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0) { return false; } const auto ps = info.padding; - const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.left, ps.right }); - const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.top, ps.bottom }); - const auto pool_le_padding_z = info.pool_size.z() <= std::max({ ps.front, ps.back }); + const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.left, ps.right}); + const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.top, ps.bottom}); + const auto pool_le_padding_z = info.pool_size.z() <= std::max({ps.front, ps.back}); return pool_le_padding_x || pool_le_padding_y || pool_le_padding_z; } const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage) { - static std::map output_stage_map = - { - { GEMMLowpOutputStageType::NONE, "" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" } - }; + static std::map output_stage_map = { + {GEMMLowpOutputStageType::NONE, ""}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down"}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint"}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float"}}; return output_stage_map[output_stage]; } @@ -175,7 +167,7 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data std::stringstream ss; std::string converted_string; - switch(data_type) + switch (data_type) { case DataType::U8: case DataType::QASYMM8: @@ -223,11 +215,16 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data return converted_string; } -PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation, +PadStrideInfo calculate_same_pad(TensorShape input_shape, + TensorShape weights_shape, + PadStrideInfo conv_info, + DataLayout data_layout, + const Size2D &dilation, const DimensionRoundingType &rounding_type) { const auto &strides = conv_info.stride(); - ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1."); + ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), + "Stride values should be greater than or equal to 1."); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -246,8 +243,9 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh const int real_weight_height = (kernel_height - 1) * dilation.y() + 1; // Calculate total pad - const int pad_width = std::max(0, static_cast((out_width - 1) * strides.first + real_weight_width - in_width)); - const int pad_height = std::max(0, static_cast((out_height - 1) * strides.second + real_weight_height - in_height)); + const int pad_width = std::max(0, static_cast((out_width - 1) * strides.first + real_weight_width - in_width)); + const int pad_height = + std::max(0, static_cast((out_height - 1) * strides.second + real_weight_height - in_height)); // Calculate individual paddings const unsigned int pad_left = pad_width 
/ 2; @@ -265,8 +263,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh return same_info; } -std::pair deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, - unsigned int kernel_width, unsigned int kernel_height, +std::pair deconvolution_output_dimensions(unsigned int in_width, + unsigned int in_height, + unsigned int kernel_width, + unsigned int kernel_height, const PadStrideInfo &pad_stride_info) { const unsigned int pad_left = pad_stride_info.pad_left(); @@ -285,8 +285,10 @@ std::pair deconvolution_output_dimensions(unsigned i return std::make_pair(w, h); } -std::pair scaled_dimensions(int width, int height, - int kernel_width, int kernel_height, +std::pair scaled_dimensions(int width, + int height, + int kernel_width, + int kernel_height, const PadStrideInfo &pad_stride_info, const Size2D &dilation) { @@ -300,15 +302,25 @@ std::pair scaled_dimensions(int width, int height, const int stride_y = pad_stride_info.stride().second; int w = 0; int h = 0; - switch(pad_stride_info.round()) + switch (pad_stride_info.round()) { case DimensionRoundingType::FLOOR: - w = static_cast(std::floor((static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1)); - h = static_cast(std::floor((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1)); + w = static_cast(std::floor( + (static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + + 1)); + h = static_cast( + std::floor((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / + stride_y) + + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast(std::ceil((static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1)); - h = static_cast(std::ceil((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1)); + w = static_cast(std::ceil( + (static_cast(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + + 1)); + h = static_cast( + std::ceil((static_cast(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / + stride_y) + + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -319,9 +331,8 @@ std::pair scaled_dimensions(int width, int height, return std::make_pair(static_cast(w), static_cast(h)); } -std::pair scaled_dimensions_signed(int width, int height, - int kernel_width, int kernel_height, - const PadStrideInfo &pad_stride_info) +std::pair scaled_dimensions_signed( + int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info) { const int pad_left = pad_stride_info.pad_left(); const int pad_top = pad_stride_info.pad_top(); @@ -331,15 +342,19 @@ std::pair scaled_dimensions_signed(int width, int height, const int stride_y = pad_stride_info.stride().second; int w = 0; int h = 0; - switch(pad_stride_info.round()) + switch (pad_stride_info.round()) { case DimensionRoundingType::FLOOR: - w = static_cast(std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + w = static_cast( + std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast( + std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); break; case DimensionRoundingType::CEIL: - 
w = static_cast(std::ceil((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + w = static_cast( + std::ceil((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast( + std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -348,8 +363,12 @@ std::pair scaled_dimensions_signed(int width, int height, return std::make_pair(static_cast(w), static_cast(h)); } -std::tuple scaled_3d_dimensions_signed(int width, int height, int depth, - int kernel_width, int kernel_height, int kernel_depth, +std::tuple scaled_3d_dimensions_signed(int width, + int height, + int depth, + int kernel_width, + int kernel_height, + int kernel_depth, const Pooling3dLayerInfo &pool3d_info) { const int pad_left = pool3d_info.padding.left; @@ -365,17 +384,23 @@ std::tuple scaled_3d_dimensions_signed(int width, int height, int int h = 0; int d = 0; - switch(pool3d_info.round_type) + switch (pool3d_info.round_type) { case DimensionRoundingType::FLOOR: - w = static_cast(std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); - d = static_cast(std::floor((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); + w = static_cast( + std::floor((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast( + std::floor((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + d = static_cast( + std::floor((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast(std::ceil((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast(std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); - d = static_cast(std::ceil((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); + w = static_cast( + std::ceil((static_cast(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast( + std::ceil((static_cast(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + d = static_cast( + std::ceil((static_cast(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -400,9 +425,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool // * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128 // * LogSoftmax with QASYMM8: scale = 1/256, offset = 0 // * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127 - if(is_data_type_quantized_asymmetric_signed(input_type)) + if (is_data_type_quantized_asymmetric_signed(input_type)) { - if(is_log) + if (is_log) { return QuantizationInfo(16.f / 256, 127); } @@ -414,17 +439,21 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool return QuantizationInfo(1.f / 256, 0); } -std::pair get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info) +std::pair get_quantized_activation_min_max(const ActivationLayerInfo &act_info, + DataType data_type, + UniformQuantizationInfo oq_info) { const bool is_qasymm8_signed = 
is_data_type_quantized_asymmetric_signed(data_type); const auto a = act_info.a(); const auto b = act_info.b(); - const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info); - const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); - const auto type_max_value = std::get<1>(get_min_max(data_type)).get(); + const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info); + const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); + const auto type_max_value = std::get<1>(get_min_max(data_type)).get(); - const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; - const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; + const int32_t min_activation = + act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; + const int32_t max_activation = + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; return std::make_pair(min_activation, max_activation); } @@ -433,11 +462,11 @@ std::unordered_map get_padding_info(std::initi { std::unordered_map res; - for(const ITensor *tensor : tensors) + for (const ITensor *tensor : tensors) { - if(tensor) + if (tensor) { - res.insert({ tensor->info(), tensor->info()->padding() }); + res.insert({tensor->info(), tensor->info()->padding()}); } } @@ -448,11 +477,11 @@ std::unordered_map get_padding_info(std::initi { std::unordered_map res; - for(const ITensorInfo *info : infos) + for (const ITensorInfo *info : infos) { - if(info) + if (info) { - res.insert({ info, info->padding() }); + res.insert({info, info->padding()}); } } @@ -461,17 +490,20 @@ std::unordered_map get_padding_info(std::initi bool has_padding_changed(const std::unordered_map &padding_map) { - return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair &padding_info) - { - return (padding_info.first->padding() != padding_info.second); - }) - != padding_map.end(); + return std::find_if(padding_map.begin(), padding_map.end(), + [](const std::pair &padding_info) + { return (padding_info.first->padding() != padding_info.second); }) != padding_map.end(); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED -void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim) +void print_consecutive_elements(std::ostream &s, + DataType dt, + const uint8_t *ptr, + unsigned int n, + int stream_width, + const std::string &element_delim) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -481,36 +513,46 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr case DataType::QSYMM8: case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::U16: case DataType::QASYMM16: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::S16: case DataType::QSYMM16: - 
print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::U32: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::S32: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::U64: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::S64: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::BFLOAT16: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::F16: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; case DataType::F32: - print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, + element_delim); break; default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); @@ -519,7 +561,7 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp index 5a6486e11e..d8f796193e 100644 --- a/src/core/Validate.cpp +++ b/src/core/Validate.cpp @@ -23,13 +23,16 @@ */ #include "arm_compute/core/Validate.h" -arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &win) +arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, + const char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &win) { full.validate(); win.validate(); - for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line); @@ -38,13 +41,16 @@ arm_compute::Status arm_compute::error_on_mismatching_windows(const char *functi return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &sub) +arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, + const char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &sub) { full.validate(); 
sub.validate(); - for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line); @@ -54,8 +60,12 @@ arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &window, const int dim) +arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, + const char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &window, + const int dim) { full.validate(); window.validate(); @@ -67,65 +77,73 @@ arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(co return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line, - const arm_compute::Coordinates &pos, unsigned int max_dim) +arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte( + const char *function, const char *file, const int line, const arm_compute::Coordinates &pos, unsigned int max_dim) { - for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line); } return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line, - const arm_compute::Window &win, unsigned int max_dim) +arm_compute::Status arm_compute::error_on_window_dimensions_gte( + const char *function, const char *file, const int line, const arm_compute::Window &win, unsigned int max_dim) { - for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR((win[i].start() != 0) || (win[i].end() != win[i].step()), - function, file, line, - "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR( + (win[i].start() != 0) || (win[i].end() != win[i].step()), function, file, line, + "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i); } return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, + const char *file, + const int line, const arm_compute::ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, - function, file, line, - "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->info()->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, function, file, line, + "Only 2D Tensors are supported by this kernel (%zu passed)", + 
tensor->info()->num_dimensions()); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, + const char *file, + const int line, const arm_compute::ITensorInfo *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, - function, file, line, - "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, function, file, line, + "Only 2D Tensors are supported by this kernel (%zu passed)", + tensor->num_dimensions()); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line, - arm_compute::Format fmt, arm_compute::Channel cn) +arm_compute::Status arm_compute::error_on_channel_not_in_known_format( + const char *function, const char *file, const int line, arm_compute::Format fmt, arm_compute::Channel cn) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line); - switch(fmt) + switch (fmt) { case arm_compute::Format::RGB888: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, + arm_compute::Channel::G, arm_compute::Channel::B); break; case arm_compute::Format::RGBA8888: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, + arm_compute::Channel::G, arm_compute::Channel::B, + arm_compute::Channel::A); break; case arm_compute::Format::UV88: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, + arm_compute::Channel::V); break; case arm_compute::Format::IYUV: case arm_compute::Format::UYVY422: @@ -133,7 +151,8 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char case arm_compute::Format::NV12: case arm_compute::Format::NV21: case arm_compute::Format::YUV444: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, + arm_compute::Channel::U, arm_compute::Channel::V); break; default: ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format."); @@ -141,21 +160,26 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, + const char *file, + const int line, const arm_compute::IKernel *kernel) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line); - 
ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), - function, file, line, + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), function, file, line, "This kernel hasn't been configured."); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line, - const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape) +arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, + const char *file, + const int line, + const TensorShape &parent_shape, + const Coordinates &coords, + const TensorShape &shape) { // Check dimensions - for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) + for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) { const bool invalid_idx = coords[i] >= static_cast(parent_shape[i]); const bool out_of_bounds_size = coords[i] + static_cast(shape[i]) > static_cast(parent_shape[i]); @@ -164,15 +188,20 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line, - const ValidRegion &parent_valid_region, const ValidRegion &valid_region) +arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, + const char *file, + const int line, + const ValidRegion &parent_valid_region, + const ValidRegion &valid_region) { // Check valid regions - for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d) + for (unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d) { ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast(valid_region.shape[d])), - function, file, line); + ARM_COMPUTE_RETURN_ERROR_ON_LOC( + (parent_valid_region.anchor[d] + static_cast(parent_valid_region.shape[d])) < + (valid_region.anchor[d] + static_cast(valid_region.shape[d])), + function, file, line); } return arm_compute::Status{}; diff --git a/src/core/common/Macros.h b/src/core/common/Macros.h index d791154e5c..bc0ea29911 100644 --- a/src/core/common/Macros.h +++ b/src/core/common/Macros.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_COMMON_MACROS_H #define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \ - TypeName(const TypeName &) = delete; \ + TypeName(const TypeName &) = delete; \ TypeName &operator=(const TypeName &) = delete; \ TypeName(TypeName &&) = default; \ - TypeName &operator=(TypeName &&) = default + TypeName &operator=(TypeName &&) = default #endif /* ARM_COMPUTE_COMMON_MACROS_H */ diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index d6dc3449fc..686304b8d7 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -46,7 +46,7 @@ #else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr -#define REGISTER_FP16_SVE(func_name) nullptr +#define REGISTER_FP16_SVE(func_name) nullptr #define REGISTER_FP16_SVE2(func_name) nullptr #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ @@ -72,7 +72,7 @@ #else /* defined(ENABLE_FP32_KERNELS) */ #define REGISTER_FP32_NEON(func_name) nullptr -#define REGISTER_FP32_SVE(func_name) nullptr +#define REGISTER_FP32_SVE(func_name) nullptr 
#define REGISTER_FP32_SVE2(func_name) nullptr #endif /* defined(ENABLE_FP32_KERNELS) */ @@ -94,7 +94,7 @@ #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr -#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr #define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ @@ -115,7 +115,7 @@ #else /* defined(ENABLE_QASYMM8_KERNELS) */ #define REGISTER_QASYMM8_NEON(func_name) nullptr -#define REGISTER_QASYMM8_SVE(func_name) nullptr +#define REGISTER_QASYMM8_SVE(func_name) nullptr #define REGISTER_QASYMM8_SVE2(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_KERNELS) */ @@ -137,7 +137,7 @@ #else /* defined(ENABLE_QSYMM16_KERNELS) */ #define REGISTER_QSYMM16_NEON(func_name) nullptr -#define REGISTER_QSYMM16_SVE(func_name) nullptr +#define REGISTER_QSYMM16_SVE(func_name) nullptr #define REGISTER_QSYMM16_SVE2(func_name) nullptr #endif /* defined(ENABLE_QSYMM16_KERNELS) */ @@ -169,7 +169,7 @@ #else /* defined(ENABLE_INTEGER_KERNELS) */ #define REGISTER_INTEGER_NEON(func_name) nullptr -#define REGISTER_INTEGER_SVE(func_name) nullptr +#define REGISTER_INTEGER_SVE(func_name) nullptr #define REGISTER_INTEGER_SVE2(func_name) nullptr #endif /* defined(ENABLE_INTEGER_KERNELS) */ diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h index 8715dcd74b..9df2a76983 100644 --- a/src/core/helpers/AutoConfiguration.h +++ b/src/core/helpers/AutoConfiguration.h @@ -24,9 +24,9 @@ #ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H #define SRC_CORE_HELPERS_AUTOCONFIGURATION_H -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/DataTypeUtils.h" namespace arm_compute { @@ -42,10 +42,11 @@ namespace arm_compute */ inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, - int num_channels, DataType data_type, - QuantizationInfo quantization_info = QuantizationInfo()) + int num_channels, + DataType data_type, + QuantizationInfo quantization_info = QuantizationInfo()) { - if(info.tensor_shape().total_size() == 0) + if (info.tensor_shape().total_size() == 0) { info.set_data_type(data_type); info.set_num_channels(num_channels); @@ -70,7 +71,7 @@ inline bool auto_init_if_empty(ITensorInfo &info, */ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source) { - if(info_sink.tensor_shape().total_size() == 0) + if (info_sink.tensor_shape().total_size() == 0) { info_sink.set_data_type(info_source.data_type()); info_sink.set_num_channels(info_source.num_channels()); @@ -93,7 +94,7 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s */ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape) { - if(info.tensor_shape().total_size() == 0) + if (info.tensor_shape().total_size() == 0) { info.set_tensor_shape(shape); return true; @@ -112,7 +113,7 @@ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape) */ inline bool set_format_if_unknown(ITensorInfo &info, Format format) { - if(info.data_type() == DataType::UNKNOWN) + if (info.data_type() == DataType::UNKNOWN) { info.set_format(format); return true; @@ -131,7 +132,7 @@ inline bool set_format_if_unknown(ITensorInfo &info, Format format) */ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type) { - if(info.data_type() == DataType::UNKNOWN) + if 
(info.data_type() == DataType::UNKNOWN) { info.set_data_type(data_type); return true; @@ -150,7 +151,7 @@ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type) */ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout) { - if(info.data_layout() == DataLayout::UNKNOWN) + if (info.data_layout() == DataLayout::UNKNOWN) { info.set_data_layout(data_layout); return true; @@ -169,7 +170,7 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout */ inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info) { - if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) + if (info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) { info.set_quantization_info(quantization_info); return true; diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index a41052687b..dd094b414c 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -24,9 +24,9 @@ #ifndef SRC_COMMON_MEMORY_HELPERS_H #define SRC_COMMON_MEMORY_HELPERS_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include @@ -43,18 +43,17 @@ inline int offset_int_vec(int offset) template struct WorkspaceDataElement { - int slot{ -1 }; - experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary }; - std::unique_ptr tensor{ nullptr }; + int slot{-1}; + experimental::MemoryLifetime lifetime{experimental::MemoryLifetime::Temporary}; + std::unique_ptr tensor{nullptr}; }; template using WorkspaceData = std::vector>; template -WorkspaceData manage_workspace(const experimental::MemoryRequirements &mem_reqs, - MemoryGroup &mgroup, - ITensorPack &run_pack) +WorkspaceData +manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack) { ITensorPack dummy_pack = ITensorPack(); return manage_workspace(mem_reqs, mgroup, run_pack, dummy_pack); @@ -63,24 +62,26 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement template WorkspaceData manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, - ITensorPack &run_pack, ITensorPack &prep_pack) + ITensorPack &run_pack, + ITensorPack &prep_pack) { WorkspaceData workspace_memory; - for(const auto &req : mem_reqs) + for (const auto &req : mem_reqs) { - if(req.size == 0) + if (req.size == 0) { continue; } - const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 }; - workspace_memory.emplace_back(WorkspaceDataElement { req.slot, req.lifetime, std::make_unique() }); + const auto aux_info = TensorInfo{TensorShape(req.size), 1, DataType::U8}; + workspace_memory.emplace_back( + WorkspaceDataElement{req.slot, req.lifetime, std::make_unique()}); auto aux_tensor = workspace_memory.back().tensor.get(); ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor); aux_tensor->allocator()->init(aux_info, req.alignment); - if(req.lifetime == experimental::MemoryLifetime::Temporary) + if (req.lifetime == experimental::MemoryLifetime::Temporary) { mgroup.manage(aux_tensor); } @@ -91,7 +92,7 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement run_pack.add_tensor(req.slot, aux_tensor); } - for(auto &mem : workspace_memory) + for (auto &mem : workspace_memory) { auto tensor = mem.tensor.get(); 
tensor->allocator()->allocate(); @@ -103,31 +104,29 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement template void release_prepare_tensors(WorkspaceData &workspace, ITensorPack &prep_pack) { - workspace.erase(std::remove_if(workspace.begin(), - workspace.end(), - [&prep_pack](auto & wk) - { - const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare; - if(to_erase) - { - prep_pack.remove_tensor(wk.slot); - } - return to_erase; - }), - workspace.end()); + workspace.erase(std::remove_if(workspace.begin(), workspace.end(), + [&prep_pack](auto &wk) + { + const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare; + if (to_erase) + { + prep_pack.remove_tensor(wk.slot); + } + return to_erase; + }), + workspace.end()); } /** Utility function to release tensors with lifetime marked as Prepare */ template -void release_temporaries(const experimental::MemoryRequirements &mem_reqs, - WorkspaceData &workspace) +void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData &workspace) { - for(auto &ws : workspace) + for (auto &ws : workspace) { const int slot = ws.slot; - for(auto &m : mem_reqs) + for (auto &m : mem_reqs) { - if(m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare) + if (m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare) { auto tensor = ws.tensor.get(); tensor->allocator()->free(); diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h index 079629ee6a..9ef045f472 100644 --- a/src/core/helpers/PoolingHelpers.h +++ b/src/core/helpers/PoolingHelpers.h @@ -33,8 +33,20 @@ namespace cpu namespace { -inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w, - const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z) +inline float calculate_avg_scale_pool3d(bool exclude_padding, + const Coordinates &id, + const int pool_size_x, + const int pool_size_y, + const int pool_size_z, + const int upper_bound_w, + const int upper_bound_h, + const int upper_bound_d, + const int pad_x, + const int pad_y, + const int pad_z, + const int stride_x, + const int stride_y, + const int stride_z) { // Based on NDHWC int start_x = id[1] * stride_x - pad_x; @@ -44,7 +56,7 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates const int end_x = std::min(start_x + pool_size_x, upper_bound_w); const int end_y = std::min(start_y + pool_size_y, upper_bound_h); const int end_z = std::min(start_z + pool_size_z, upper_bound_d); - if(exclude_padding) + if (exclude_padding) { start_x = std::max(0, start_x); start_y = std::max(0, start_y); @@ -53,8 +65,17 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z)); } -inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +inline float calculate_avg_scale_pool2d(bool exclude_padding, + DataLayout data_layout, + const Coordinates &id, + const int pool_size_x, + const int pool_size_y, + const int upper_bound_w, + const int upper_bound_h, + const int pad_x, + const int 
pad_y, + const int stride_x, + const int stride_y) { const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -64,7 +85,7 @@ inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_la const int end_x = std::min(start_x + pool_size_x, upper_bound_w); const int end_y = std::min(start_y + pool_size_y, upper_bound_h); - if(exclude_padding) + if (exclude_padding) { start_x = std::max(0, start_x); start_y = std::max(0, start_y); @@ -117,17 +138,26 @@ inline float32x4_t vcvtq_f32_q32(int32x4_t values) } template -inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset); +inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset); template <> -inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset) { const float new_scale = quant_rescale / scale_pooling; return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset)); } template <> -inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset) { const float new_scale = quant_rescale / scale_pooling; return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset)); @@ -139,30 +169,24 @@ inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInf template <> inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x4_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), - } - }; + const float32x4x4_t acc = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), + }}; return vquantize(acc, requant_qinfo); } template <> inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x4_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), - } - }; + const float32x4x4_t acc = {{ + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), + }}; return vquantize_signed(acc, requant_qinfo); } @@ -172,26 +196,20 @@ inline T vrequantize_pooling(T 
&vec, const UniformQuantizationInfo &requant_qinf template <> inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x2_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), - } - }; + const float32x4x2_t acc = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), + }}; return vquantize(acc, requant_qinfo); } template <> inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x2_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), - } - }; + const float32x4x2_t acc = {{ + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), + }}; return vquantize_signed(acc, requant_qinfo); } @@ -199,4 +217,3 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */ - diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h index e769bba782..47605e7385 100644 --- a/src/core/helpers/ScaleHelpers.h +++ b/src/core/helpers/ScaleHelpers.h @@ -50,8 +50,12 @@ namespace scale_helpers * * @return The bilinear interpolated pixel value */ -inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy, - UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) +inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, + size_t stride, + float dx, + float dy, + UniformQuantizationInfo iq_info, + UniformQuantizationInfo oq_info) { ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); @@ -85,8 +89,12 @@ inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stri * * @return The bilinear interpolated pixel value */ -inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy, - UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) +inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, + size_t stride, + float dx, + float dy, + UniformQuantizationInfo iq_info, + UniformQuantizationInfo oq_info) { ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); @@ -122,9 +130,8 @@ inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride * * @return The pixel at (x, y) using area interpolation. 
*/ -inline uint8_t -pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, - float hr, int x, int y) +inline uint8_t pixel_area_c1u8_clamp( + const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y) { ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr); @@ -159,7 +166,7 @@ pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t widt // Sum pixels in area int sum = 0; - for(int j = yi + y_from, je = yi + y_to; j <= je; ++j) + for (int j = yi + y_from, je = yi + y_to; j <= je; ++j) { const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from; sum = std::accumulate(ptr, ptr + x_elements, sum); diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp index 71b971af31..8184991ab5 100644 --- a/src/core/helpers/SoftmaxHelpers.cpp +++ b/src/core/helpers/SoftmaxHelpers.cpp @@ -29,7 +29,7 @@ namespace softmax_helpers { PermutationVector get_permutation_vector_from_softmax_axis(size_t axis) { - switch(axis) + switch (axis) { case 1: return PermutationVector(1U, 0U, 2U, 3U); diff --git a/src/core/helpers/Utils.cpp b/src/core/helpers/Utils.cpp index 3900475355..6ca29d180d 100644 --- a/src/core/helpers/Utils.cpp +++ b/src/core/helpers/Utils.cpp @@ -31,9 +31,9 @@ bool has_holes(const ITensorInfo &info, size_t dimension) const auto &strides = info.strides_in_bytes(); size_t squashed_bytes = info.element_size(); - for(size_t dim = 0; dim <= dimension; ++dim) + for (size_t dim = 0; dim <= dimension; ++dim) { - if(strides[dim] != squashed_bytes) + if (strides[dim] != squashed_bytes) { return true; } diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h index 7ad960bfa2..2e7224c55b 100644 --- a/src/core/helpers/Utils.h +++ b/src/core/helpers/Utils.h @@ -45,7 +45,7 @@ inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixe // Create strides object Strides strides(stride_x, fixed_strides...); - for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i) + for (size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i) { strides.set(i, shape[i - 1] * strides[i - 1]); } diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp index a4d46db352..30a55fcbc6 100644 --- a/src/core/helpers/WindowHelpers.cpp +++ b/src/core/helpers/WindowHelpers.cpp @@ -25,9 +25,10 @@ namespace arm_compute { -Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) +Window +calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) { - if(!skip_border) + if (!skip_border) { border_size = BorderSize(0); } @@ -38,40 +39,47 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - static_cast(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + anchor[0] + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + anchor[0] + border_size.left + + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - + 
static_cast(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { - window.set(1, Window::Dimension( + window.set(1, + Window::Dimension( // Skip the border above the image anchor[1] + border_size.top, // Skip the border below the image - anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast(shape[1]) - static_cast(border_size.top) - static_cast(border_size.bottom)), steps[1]), + anchor[1] + border_size.top + + ceil_to_multiple(std::max(0, static_cast(shape[1]) - static_cast(border_size.top) - + static_cast(border_size.bottom)), + steps[1]), steps[1])); ++n; } - if(anchor.num_dimensions() > 2) + if (anchor.num_dimensions() > 2) { window.set(2, Window::Dimension(anchor[2], std::max(1, shape[2]), steps[2])); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -81,7 +89,7 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool skip_border, BorderSize border_size) { - if(!skip_border) + if (!skip_border) { border_size = BorderSize(0); } @@ -89,40 +97,46 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool s Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - border_size.left + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - static_cast(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + border_size.left + + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - + static_cast(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(shape.num_dimensions() > 1) + if (shape.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Skip the border above the image - border_size.top, - // Skip the border below the image - border_size.top + ceil_to_multiple(std::max(0, static_cast(shape[1]) - static_cast(border_size.top) - static_cast(border_size.bottom)), steps[1]), - steps[1])); + // Skip the border above the image + border_size.top, + // Skip the border below the image + border_size.top + ceil_to_multiple(std::max(0, static_cast(shape[1]) - + static_cast(border_size.top) - + static_cast(border_size.bottom)), + steps[1]), + steps[1])); ++n; } - if(shape.num_dimensions() > 2) + if (shape.num_dimensions() > 2) { window.set(2, Window::Dimension(0, std::max(1, shape[2]), steps[2])); ++n; } - for(; n < shape.num_dimensions(); ++n) + for (; n < shape.num_dimensions(); ++n) { window.set(n, Window::Dimension(0, std::max(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -138,40 +152,42 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step Window window; window.set(0, Window::Dimension( - // move the anchor to the start from the border - anchor[0] - border_size.left, - // move the anchor to 
include the right end border - // Make sure the window width is a multiple of the step size - anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]), - steps[0])); + // move the anchor to the start from the border + anchor[0] - border_size.left, + // move the anchor to include the right end border + // Make sure the window width is a multiple of the step size + anchor[0] - border_size.left + + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Include the border above the image - anchor[1] - border_size.top, - // Include the border below the image - anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]), - steps[1])); + // Include the border above the image + anchor[1] - border_size.top, + // Include the border below the image + anchor[1] - border_size.top + + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]), + steps[1])); ++n; } - if(anchor.num_dimensions() > 2) + if (anchor.num_dimensions() > 2) { window.set(2, Window::Dimension(0, std::max(1, shape[n]), steps[2])); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -179,9 +195,12 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step return window; } -Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) +Window calculate_max_window_horizontal(const ValidRegion &valid_region, + const Steps &steps, + bool skip_border, + BorderSize border_size) { - if(skip_border) + if (skip_border) { border_size.top = 0; border_size.bottom = 0; @@ -198,33 +217,35 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - static_cast(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + anchor[0] + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + anchor[0] + border_size.left + + ceil_to_multiple(std::max(0, static_cast(shape[0]) - static_cast(border_size.left) - + static_cast(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Skip the border above the image - anchor[1] - border_size.top, - // Skip the border below the image - anchor[1] + shape[1] + border_size.bottom, - 1)); + // Skip the border above the image + anchor[1] - border_size.top, + // Skip the border below the image + anchor[1] + shape[1] + border_size.bottom, 1)); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max(1, shape[n]))); } - for(; n < 
Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -247,9 +268,9 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr size_t squashed_bytes = src0.element_size(); // Try to squash the low dimensions together. - for(; dim < num_dimensions; ++dim) + for (; dim < num_dimensions; ++dim) { - if(shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes) + if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes) { break; } @@ -257,7 +278,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr squashed_bytes *= shape0[dim]; } - if(dim == num_dimensions) + if (dim == num_dimensions) { auto squashed_elements = squashed_bytes / src0.element_size(); @@ -266,7 +287,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr // The input tensors can be interpreted as 1D array. win.set(0, Window::Dimension(0, squashed_elements, 1)); - for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, 1, 1)); } @@ -274,7 +295,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr else { // Generates the max window. - for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, std::max(shape0[dim], shape1[dim]), 1)); } @@ -295,21 +316,21 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr size_t squashed_bytes = src.element_size(); // Try to squash the low dimensions together. - for(; dim < num_dimensions; ++dim) + for (; dim < num_dimensions; ++dim) { - if(strides[dim] != squashed_bytes) + if (strides[dim] != squashed_bytes) { break; } squashed_bytes *= shape[dim]; } - if(dim == num_dimensions) + if (dim == num_dimensions) { const auto squashed_elements = squashed_bytes / src.element_size(); split_dimension = Window::DimX; // The input tensor can be interpreted as 1D array. win.set(0, Window::Dimension(0, squashed_elements, 1)); - for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, 1, 1)); } @@ -317,7 +338,7 @@ std::pair calculate_squashed_or_max_window(const ITensorInfo &sr else { // Generate the max window. - for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, shape[dim], 1)); } diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h index eccf7f2d18..e404c18e8a 100644 --- a/src/core/helpers/WindowHelpers.h +++ b/src/core/helpers/WindowHelpers.h @@ -43,21 +43,13 @@ namespace arm_compute * influence the returned value. */ template -bool update_window_and_padding(Window &win, Ts &&... 
patterns) +bool update_window_and_padding(Window &win, Ts &&...patterns) { bool window_changed = false; - utility::for_each([&](const IAccessWindow & w) - { - window_changed |= w.update_window_if_needed(win); - }, - patterns...); + utility::for_each([&](const IAccessWindow &w) { window_changed |= w.update_window_if_needed(win); }, patterns...); - utility::for_each([&](IAccessWindow & w) - { - w.update_padding_if_needed(win); - }, - patterns...); + utility::for_each([&](IAccessWindow &w) { w.update_padding_if_needed(win); }, patterns...); return window_changed; } @@ -69,18 +61,18 @@ bool update_window_and_padding(Window &win, Ts &&... patterns) * @return Intersection of all regions. */ template -ValidRegion intersect_valid_regions(const Ts &... regions) +ValidRegion intersect_valid_regions(const Ts &...regions) { - auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion + auto intersect = [](const ValidRegion &r1, const ValidRegion &r2) -> ValidRegion { ValidRegion region; - for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) + for (size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) { region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d])); } - for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) + for (size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) { region.shape.set(d, std::min(r1.shape[d], r2.shape[d])); } @@ -101,7 +93,10 @@ ValidRegion intersect_valid_regions(const Ts &... regions) * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window(const ValidRegion &valid_region, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting * @@ -112,7 +107,10 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window(const TensorShape &shape, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting * @@ -123,7 +121,10 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps * * @return The maximum window the kernel can be executed on. */ -inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) +inline Window calculate_max_window(const ITensorInfo &info, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()) { return calculate_max_window(info.tensor_shape(), steps, skip_border, border_size); } @@ -137,7 +138,10 @@ inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = * * @return The maximum window the kernel can be executed on. 
 */
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+                                       const Steps &steps = Steps(),
+                                       bool skip_border = false,
+                                       BorderSize border_size = BorderSize());
 
 /** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
  *
@@ -148,7 +152,10 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St
  *
  * @return The maximum window the kernel can be executed on.
  */
-inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window_horizontal(const ITensorInfo &info,
+                                              const Steps &steps = Steps(),
+                                              bool skip_border = false,
+                                              BorderSize border_size = BorderSize())
 {
     return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
 }
@@ -161,7 +168,9 @@ inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Ste
  *
  * @return The maximum window the kernel can be executed on.
  */
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+Window calculate_max_enlarged_window(const ValidRegion &valid_region,
+                                     const Steps &steps = Steps(),
+                                     BorderSize border_size = BorderSize());
 
 /** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border.
  *
@@ -171,7 +180,9 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
  *
  * @return The maximum window the kernel can be executed on.
  */
-inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
+inline Window calculate_max_enlarged_window(const ITensorInfo &info,
+                                            const Steps &steps = Steps(),
+                                            BorderSize border_size = BorderSize())
 {
     return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
 }
@@ -208,7 +219,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
  * @return A pair of the shape and window
  */
 template <typename... Shapes>
-std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &...
shapes) +std::pair compute_output_shape_and_window(const Shapes &...shapes) { const TensorShape out_shape = TensorShape::broadcast_shape(shapes...); return std::make_pair(out_shape, calculate_max_window(out_shape)); diff --git a/src/core/utils/ActivationFunctionUtils.cpp b/src/core/utils/ActivationFunctionUtils.cpp index 4854b8eb0b..017170a0c5 100644 --- a/src/core/utils/ActivationFunctionUtils.cpp +++ b/src/core/utils/ActivationFunctionUtils.cpp @@ -28,26 +28,24 @@ namespace arm_compute { -const std::string &string_from_activation_func(const ActivationFunction& act) +const std::string &string_from_activation_func(const ActivationFunction &act) { - static std::map act_map = - { - { ActivationFunction::ABS, "ABS" }, - { ActivationFunction::LINEAR, "LINEAR" }, - { ActivationFunction::LOGISTIC, "LOGISTIC" }, - { ActivationFunction::RELU, "RELU" }, - { ActivationFunction::BOUNDED_RELU, "BRELU" }, - { ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" }, - { ActivationFunction::LEAKY_RELU, "LRELU" }, - { ActivationFunction::SOFT_RELU, "SRELU" }, - { ActivationFunction::ELU, "ELU" }, - { ActivationFunction::SQRT, "SQRT" }, - { ActivationFunction::SQUARE, "SQUARE" }, - { ActivationFunction::TANH, "TANH" }, - { ActivationFunction::IDENTITY, "IDENTITY" }, - { ActivationFunction::HARD_SWISH, "HARD_SWISH" }, - { ActivationFunction::SWISH, "SWISH" }, - { ActivationFunction::GELU, "GELU" } + static std::map act_map = {{ActivationFunction::ABS, "ABS"}, + {ActivationFunction::LINEAR, "LINEAR"}, + {ActivationFunction::LOGISTIC, "LOGISTIC"}, + {ActivationFunction::RELU, "RELU"}, + {ActivationFunction::BOUNDED_RELU, "BRELU"}, + {ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU"}, + {ActivationFunction::LEAKY_RELU, "LRELU"}, + {ActivationFunction::SOFT_RELU, "SRELU"}, + {ActivationFunction::ELU, "ELU"}, + {ActivationFunction::SQRT, "SQRT"}, + {ActivationFunction::SQUARE, "SQUARE"}, + {ActivationFunction::TANH, "TANH"}, + {ActivationFunction::IDENTITY, "IDENTITY"}, + {ActivationFunction::HARD_SWISH, "HARD_SWISH"}, + {ActivationFunction::SWISH, "SWISH"}, + {ActivationFunction::GELU, "GELU"} }; diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp index 6d483adc7f..d97ea42091 100644 --- a/src/core/utils/AssemblyUtils.cpp +++ b/src/core/utils/AssemblyUtils.cpp @@ -34,12 +34,12 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) arm_gemm::Activation gemm_act; // Early exit in case lower bound is other than 0, as it's not yet supported - if(act.b() != 0.f) + if (act.b() != 0.f) { return gemm_act; } - switch(act.activation()) + switch (act.activation()) { case ActivationLayerInfo::ActivationFunction::RELU: gemm_act.type = arm_gemm::Activation::Type::ReLU; @@ -63,17 +63,15 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info) { - return arm_conv::PaddingValues{ pad_stride_info.pad_left(), - pad_stride_info.pad_top(), - pad_stride_info.pad_right(), - pad_stride_info.pad_bottom() }; + return arm_conv::PaddingValues{pad_stride_info.pad_left(), pad_stride_info.pad_top(), pad_stride_info.pad_right(), + pad_stride_info.pad_bottom()}; } arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format) { arm_gemm::WeightFormat gemm_weight_fromat; - switch(weight_format) + switch (weight_format) { case arm_compute::WeightFormat::UNSPECIFIED: gemm_weight_fromat = arm_gemm::WeightFormat::UNSPECIFIED; @@ -193,7 +191,7 @@ 
arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::Weigh { arm_compute::WeightFormat acl_weight_fromat; - switch(weight_format) + switch (weight_format) { case arm_gemm::WeightFormat::UNSPECIFIED: acl_weight_fromat = arm_compute::WeightFormat::UNSPECIFIED; diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h index 60bad3b618..7d0d37c4ef 100644 --- a/src/core/utils/AssemblyUtils.h +++ b/src/core/utils/AssemblyUtils.h @@ -25,6 +25,7 @@ #define UTILS_CORE_ASSEMBLY_UTILS_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/kernels/assembly/common.hpp" #include "src/cpu/kernels/assembly/arm_gemm.hpp" @@ -65,6 +66,6 @@ arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFo * @return Compute Library WeightFormat */ arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format); -} // namespace assembly +} // namespace assembly_utils } // namespace arm_compute #endif /* UTILS_CORE_ASSEMBLY_UTILS_H */ diff --git a/src/core/utils/DataLayoutUtils.cpp b/src/core/utils/DataLayoutUtils.cpp index 4919b79a42..234bed71cb 100644 --- a/src/core/utils/DataLayoutUtils.cpp +++ b/src/core/utils/DataLayoutUtils.cpp @@ -29,11 +29,10 @@ namespace arm_compute const std::string &string_from_data_layout(DataLayout dl) { - static std::map dl_map = - { - { DataLayout::UNKNOWN, "UNKNOWN" }, - { DataLayout::NCHW, "NCHW" }, - { DataLayout::NHWC, "NHWC" }, + static std::map dl_map = { + {DataLayout::UNKNOWN, "UNKNOWN"}, + {DataLayout::NCHW, "NCHW"}, + {DataLayout::NHWC, "NHWC"}, }; return dl_map[dl]; diff --git a/src/core/utils/DataTypeUtils.cpp b/src/core/utils/DataTypeUtils.cpp index 07999354d9..1394339987 100644 --- a/src/core/utils/DataTypeUtils.cpp +++ b/src/core/utils/DataTypeUtils.cpp @@ -30,27 +30,26 @@ namespace arm_compute { const std::string &string_from_data_type(DataType dt) { - static std::map dt_map = - { - { DataType::UNKNOWN, "UNKNOWN" }, - { DataType::S8, "S8" }, - { DataType::U8, "U8" }, - { DataType::S16, "S16" }, - { DataType::U16, "U16" }, - { DataType::S32, "S32" }, - { DataType::U32, "U32" }, - { DataType::S64, "S64" }, - { DataType::U64, "U64" }, - { DataType::F16, "F16" }, - { DataType::F32, "F32" }, - { DataType::F64, "F64" }, - { DataType::SIZET, "SIZET" }, - { DataType::QSYMM8, "QSYMM8" }, - { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" }, - { DataType::QASYMM8, "QASYMM8" }, - { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" }, - { DataType::QSYMM16, "QSYMM16" }, - { DataType::QASYMM16, "QASYMM16" }, + static std::map dt_map = { + {DataType::UNKNOWN, "UNKNOWN"}, + {DataType::S8, "S8"}, + {DataType::U8, "U8"}, + {DataType::S16, "S16"}, + {DataType::U16, "U16"}, + {DataType::S32, "S32"}, + {DataType::U32, "U32"}, + {DataType::S64, "S64"}, + {DataType::U64, "U64"}, + {DataType::F16, "F16"}, + {DataType::F32, "F32"}, + {DataType::F64, "F64"}, + {DataType::SIZET, "SIZET"}, + {DataType::QSYMM8, "QSYMM8"}, + {DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL"}, + {DataType::QASYMM8, "QASYMM8"}, + {DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED"}, + {DataType::QSYMM16, "QSYMM16"}, + {DataType::QASYMM16, "QASYMM16"}, }; return dt_map[dt]; @@ -58,12 +57,11 @@ const std::string &string_from_data_type(DataType dt) DataType data_type_from_name(const std::string &name) { - static const std::map data_types = - { - { "f16", DataType::F16 }, - { "f32", DataType::F32 }, - { "qasymm8", DataType::QASYMM8 }, - { "qasymm8_signed", DataType::QASYMM8_SIGNED }, + static const std::map data_types = 
{ + {"f16", DataType::F16}, + {"f32", DataType::F32}, + {"qasymm8", DataType::QASYMM8}, + {"qasymm8_signed", DataType::QASYMM8_SIGNED}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -74,7 +72,7 @@ DataType data_type_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str()); } diff --git a/src/core/utils/FormatUtils.cpp b/src/core/utils/FormatUtils.cpp index 05b649ded2..46f8455315 100644 --- a/src/core/utils/FormatUtils.cpp +++ b/src/core/utils/FormatUtils.cpp @@ -30,26 +30,16 @@ namespace arm_compute { const std::string &string_from_format(Format format) { - static std::map formats_map = - { - { Format::UNKNOWN, "UNKNOWN" }, - { Format::U8, "U8" }, - { Format::S16, "S16" }, - { Format::U16, "U16" }, - { Format::S32, "S32" }, - { Format::U32, "U32" }, - { Format::F16, "F16" }, - { Format::F32, "F32" }, - { Format::UV88, "UV88" }, - { Format::RGB888, "RGB888" }, - { Format::RGBA8888, "RGBA8888" }, - { Format::YUV444, "YUV444" }, - { Format::YUYV422, "YUYV422" }, - { Format::NV12, "NV12" }, - { Format::NV21, "NV21" }, - { Format::IYUV, "IYUV" }, - { Format::UYVY422, "UYVY422" } - }; + static std::map formats_map = { + {Format::UNKNOWN, "UNKNOWN"}, {Format::U8, "U8"}, + {Format::S16, "S16"}, {Format::U16, "U16"}, + {Format::S32, "S32"}, {Format::U32, "U32"}, + {Format::F16, "F16"}, {Format::F32, "F32"}, + {Format::UV88, "UV88"}, {Format::RGB888, "RGB888"}, + {Format::RGBA8888, "RGBA8888"}, {Format::YUV444, "YUV444"}, + {Format::YUYV422, "YUYV422"}, {Format::NV12, "NV12"}, + {Format::NV21, "NV21"}, {Format::IYUV, "IYUV"}, + {Format::UYVY422, "UYVY422"}}; return formats_map[format]; } diff --git a/src/core/utils/InterpolationPolicyUtils.cpp b/src/core/utils/InterpolationPolicyUtils.cpp index 2d6cabe85e..276e760544 100644 --- a/src/core/utils/InterpolationPolicyUtils.cpp +++ b/src/core/utils/InterpolationPolicyUtils.cpp @@ -29,11 +29,10 @@ namespace arm_compute const std::string &string_from_interpolation_policy(InterpolationPolicy policy) { - static std::map interpolation_policy_map = - { - { InterpolationPolicy::AREA, "AREA" }, - { InterpolationPolicy::BILINEAR, "BILINEAR" }, - { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" }, + static std::map interpolation_policy_map = { + {InterpolationPolicy::AREA, "AREA"}, + {InterpolationPolicy::BILINEAR, "BILINEAR"}, + {InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR"}, }; return interpolation_policy_map[policy]; diff --git a/src/core/utils/ScaleUtils.cpp b/src/core/utils/ScaleUtils.cpp index ee57a8e7a7..a92da39b67 100644 --- a/src/core/utils/ScaleUtils.cpp +++ b/src/core/utils/ScaleUtils.cpp @@ -23,11 +23,12 @@ */ #include "src/core/utils/ScaleUtils.h" -#include "src/common/cpuinfo/CpuIsaInfo.h" #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" +#include "src/common/cpuinfo/CpuIsaInfo.h" + float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners) { const size_t offset = (align_corners && output_size > 1) ? 
1 : 0; @@ -40,13 +41,15 @@ float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t return static_cast(in) / static_cast(out); } -bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, DataType data_type, - InterpolationPolicy policy, BorderMode border_mode) +bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, + DataType data_type, + InterpolationPolicy policy, + BorderMode border_mode) { // Do not calculate precomputed weights and indices if kernel code doesn't use them - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { - switch(data_type) + switch (data_type) { case DataType::F32: case DataType::F16: @@ -62,4 +65,4 @@ bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout } return true; -} \ No newline at end of file +} diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h index 1484824a7f..d8dddc8c70 100644 --- a/src/core/utils/ScaleUtils.h +++ b/src/core/utils/ScaleUtils.h @@ -60,8 +60,11 @@ inline bool is_align_corners_allowed_sampling_policy(SamplingPolicy sampling_pol * * @return True if precomputation is required */ -bool is_precomputation_required(DataLayout data_layout, DataType data_type, InterpolationPolicy policy, BorderMode border_mode); +bool is_precomputation_required(DataLayout data_layout, + DataType data_type, + InterpolationPolicy policy, + BorderMode border_mode); } // namespace scale_utils } // namespace arm_compute -#endif /* UTILS_CORE_SCALEUTILS_H */ \ No newline at end of file +#endif /* UTILS_CORE_SCALEUTILS_H */ diff --git a/src/core/utils/StringUtils.cpp b/src/core/utils/StringUtils.cpp index 6d05c9b64e..bcab0ce10c 100644 --- a/src/core/utils/StringUtils.cpp +++ b/src/core/utils/StringUtils.cpp @@ -55,7 +55,7 @@ std::string float_to_string_with_full_precision(float val) ss.precision(std::numeric_limits::max_digits10); ss << val; - if(val != static_cast(val)) + if (val != static_cast(val)) { ss << "f"; } @@ -65,17 +65,11 @@ std::string float_to_string_with_full_precision(float val) std::string join(const std::vector strings, const std::string &sep) { - if(strings.empty()) + if (strings.empty()) { return ""; } - return std::accumulate( - std::next(strings.begin()), - strings.end(), - strings.at(0), - [&sep](const std::string & a, const std::string & b) - { - return a + sep + b; - }); -} + return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0), + [&sep](const std::string &a, const std::string &b) { return a + sep + b; }); } +} // namespace arm_compute diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp index 64633c643d..edc8d0eacc 100644 --- a/src/core/utils/helpers/fft.cpp +++ b/src/core/utils/helpers/fft.cpp @@ -37,7 +37,7 @@ std::vector decompose_stages(unsigned int N, const std::set decompose_stages(unsigned int N, const std::set= factor) + if (0 == (res % factor) && res >= factor) { stages.push_back(factor); res /= factor; @@ -57,9 +57,9 @@ std::vector decompose_stages(unsigned int N, const std::set 1) + if (res > 1) { // Couldn't decompose with given factors stages.clear(); @@ -81,8 +81,9 @@ std::vector digit_reverse_indices(unsigned int N, const std::vecto std::vector idx_digit_reverse; // Early exit in case N and fft stages do not match - const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies()); - if(stages_prod != N) + const float stages_prod = + std::accumulate(std::begin(fft_stages), 
std::end(fft_stages), 1, std::multiplies()); + if (stages_prod != N) { return idx_digit_reverse; } @@ -94,13 +95,13 @@ std::vector digit_reverse_indices(unsigned int N, const std::vecto unsigned int n_stages = fft_stages.size(); // Scan elements - for(unsigned int n = 0; n < N; ++n) + for (unsigned int n = 0; n < N; ++n) { unsigned int k = n; unsigned int Nx = fft_stages[0]; // Scan stages - for(unsigned int s = 1; s < n_stages; ++s) + for (unsigned int s = 1; s < n_stages; ++s) { // radix of stage i-th unsigned int Ny = fft_stages[s]; diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h index 99e1ea54ee..7f7fbd13bf 100644 --- a/src/core/utils/helpers/float_ops.h +++ b/src/core/utils/helpers/float_ops.h @@ -39,8 +39,7 @@ union RawFloat * * @param[in] val Floating-point value */ - explicit RawFloat(float val) - : f32(val) + explicit RawFloat(float val) : f32(val) { } /** Extract sign of floating point number diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h index 9279532e2a..fd4745a453 100644 --- a/src/core/utils/helpers/tensor_info.h +++ b/src/core/utils/helpers/tensor_info.h @@ -41,15 +41,17 @@ namespace tensor_info * @return True if tensors have mismatching quantization info else false. */ template -inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) +inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, + const ITensorInfo *tensor_info_2, + Ts... tensor_infos) { const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info(); - const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward(tensor_infos)... } }; - return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) - { - return tensor_info->quantization_info() != first_quantization_info; - }); + const std::array tensor_infos_array{ + {tensor_info_2, std::forward(tensor_infos)...}}; + return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), + [&](const ITensorInfo *tensor_info) + { return tensor_info->quantization_info() != first_quantization_info; }); } } // namespace tensor_info } // namespace helpers diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp index f2216995a9..19d0badd74 100644 --- a/src/core/utils/helpers/tensor_transform.cpp +++ b/src/core/utils/helpers/tensor_transform.cpp @@ -36,10 +36,11 @@ int calculate_stride_on_index(int index, Coordinates strides) return index >= static_cast(strides.num_dimensions()) ? 1 : strides[index]; } -int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask) +int calculate_start_on_index( + TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask) { // Early exit - if(index >= static_cast(starts.num_dimensions())) + if (index >= static_cast(starts.num_dimensions())) { return 0; } @@ -51,14 +52,14 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta int start = starts[index]; // Reset in case of begin mask present - if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index)) + if (arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index)) { start = stride > 0 ? 
std::numeric_limits::lowest() : std::numeric_limits::max(); } // Account negative start points const int dim_size = input_shape[index]; - if(start < 0) + if (start < 0) { start += dim_size; } @@ -69,12 +70,16 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta return start; } -int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index, - Coordinates ends, Coordinates strides, - int32_t end_mask, int32_t shrink_axis_mask) +int calculate_end_on_index(TensorShape input_shape, + int index, + int start_on_index, + Coordinates ends, + Coordinates strides, + int32_t end_mask, + int32_t shrink_axis_mask) { // Early exit - if(index >= static_cast(ends.num_dimensions())) + if (index >= static_cast(ends.num_dimensions())) { return input_shape[index]; } @@ -86,9 +91,9 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde int stop = ends[index]; // Shrink dimension - if(shrink_axis) + if (shrink_axis) { - if(start_on_index == std::numeric_limits::max()) + if (start_on_index == std::numeric_limits::max()) { stop = start_on_index; } @@ -99,14 +104,14 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde } // Reset in case of begin mask present - if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis) + if (arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis) { stop = (stride > 0) ? std::numeric_limits::max() : std::numeric_limits::lowest(); } // Account negative end points const int dim_size = input_shape[index]; - if(stop < 0) + if (stop < 0) { stop += dim_size; } @@ -118,14 +123,18 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde } std::tuple calculate_strided_slice_coords(TensorShape input_shape, - Coordinates starts, Coordinates ends, Coordinates strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { Coordinates starts_abs{}; Coordinates ends_abs{}; Coordinates final_strides{}; - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask); starts_abs.set(i, start_i); @@ -136,13 +145,19 @@ std::tuple calculate_strided_slice_coords return std::make_tuple(starts_abs, ends_abs, final_strides); } -TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked) +TensorShape compute_strided_slice_output_shape(TensorShape input_shape, + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask, + bool return_unshrinked) { unsigned int index = 0; TensorShape output_shape; - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const int stride = calculate_stride_on_index(index, strides); const int start = calculate_start_on_index(input_shape, i, starts, strides, begin_mask); @@ -150,11 +165,11 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina const int range = end - start; const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i); - 
if(return_unshrinked || !is_shrink) + if (return_unshrinked || !is_shrink) { - if((range == 0) || // Zero range - (range < 0 && stride >= 0) || // Negative range with positive stride - (range > 0 && stride <= 0)) // Positive range with negative stride + if ((range == 0) || // Zero range + (range < 0 && stride >= 0) || // Negative range with positive stride + (range > 0 && stride <= 0)) // Positive range with negative stride { output_shape.set(index, 0); return output_shape; @@ -173,9 +188,9 @@ int32_t construct_slice_end_mask(Coordinates ends) { // Create end mask int32_t end_mask = 0; - for(unsigned int i = 0; i < ends.num_dimensions(); ++i) + for (unsigned int i = 0; i < ends.num_dimensions(); ++i) { - if(ends[i] < 0) + if (ends[i] < 0) { end_mask |= 1 << i; } diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp index 95fc2e3fa2..d106493238 100644 --- a/src/core/utils/io/FileHandler.cpp +++ b/src/core/utils/io/FileHandler.cpp @@ -21,16 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include - #include "arm_compute/core/utils/io/FileHandler.h" #include "arm_compute/core/Error.h" +#include + using namespace arm_compute::io; -FileHandler::FileHandler() - : _filestream(), _filename(" "), _mode() +FileHandler::FileHandler() : _filestream(), _filename(" "), _mode() { } diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp index 55e78f9630..7b4eead38d 100644 --- a/src/core/utils/logging/FilePrinter.cpp +++ b/src/core/utils/logging/FilePrinter.cpp @@ -25,8 +25,7 @@ using namespace arm_compute::logging; -FilePrinter::FilePrinter(const std::string &filename) - : _handler() +FilePrinter::FilePrinter(const std::string &filename) : _handler() { _handler.open(filename, std::fstream::out | std::fstream::trunc); } @@ -34,4 +33,4 @@ FilePrinter::FilePrinter(const std::string &filename) void FilePrinter::print_internal(const std::string &msg) { _handler.stream() << msg << std::endl; -} \ No newline at end of file +} diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp index c3df7f6207..14ad910562 100644 --- a/src/core/utils/logging/Helpers.cpp +++ b/src/core/utils/logging/Helpers.cpp @@ -30,13 +30,12 @@ using namespace arm_compute::logging; const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level) { - static std::map log_level_map = - { - { LogLevel::VERBOSE, "VERBOSE" }, - { LogLevel::INFO, "INFO" }, - { LogLevel::WARN, "WARN" }, - { LogLevel::OFF, "OFF" }, + static std::map log_level_map = { + {LogLevel::VERBOSE, "VERBOSE"}, + {LogLevel::INFO, "INFO"}, + {LogLevel::WARN, "WARN"}, + {LogLevel::OFF, "OFF"}, }; return log_level_map[log_level]; -} \ No newline at end of file +} diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp index 70b5868da8..d6681f8179 100644 --- a/src/core/utils/logging/Logger.cpp +++ b/src/core/utils/logging/Logger.cpp @@ -30,10 +30,7 @@ using namespace arm_compute::logging; Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr printer) - : _name(std::move(name)), _log_level(log_level), _printers( -{ - std::move(printer) -}), _decorators() + : _name(std::move(name)), _log_level(log_level), _printers({std::move(printer)}), _decorators() { // Check printer ARM_COMPUTE_ERROR_ON(printer == nullptr); @@ -46,7 +43,7 @@ Logger::Logger(std::string name, LogLevel log_level, std::vectordecorate(msg); } @@ -148,7 +145,7 @@ std::string Logger::create_log_msg(const 
std::string &str, LogLevel log_level)
 void Logger::print_all(const std::string &msg)
 {
-    for(auto &p : _printers)
+    for (auto &p : _printers)
     {
         p->print(msg);
     }
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index c281d8863c..17015d9ae9 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -24,15 +24,15 @@
 #include "arm_compute/core/utils/logging/LoggerRegistry.h"
 
 #include "arm_compute/core/Error.h"
+
 #include "support/Mutex.h"
 
 using namespace arm_compute::logging;
 
 /** Reserved logger used by the library */
-std::set<std::string> LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" };
+std::set<std::string> LoggerRegistry::_reserved_loggers = {"CORE", "RUNTIME", "GRAPH"};
 
-LoggerRegistry::LoggerRegistry()
-    : _mtx(), _loggers()
+LoggerRegistry::LoggerRegistry() : _mtx(), _loggers()
 {
 }
 
@@ -42,10 +42,12 @@ LoggerRegistry &LoggerRegistry::get()
     return _instance;
 }
 
-void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
+void LoggerRegistry::create_logger(const std::string &name,
+                                   LogLevel log_level,
+                                   const std::vector<std::shared_ptr<Printer>> &printers)
 {
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-    if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
+    if ((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
     {
         _loggers[name] = std::make_shared<Logger>(name, log_level, printers);
     }
@@ -54,7 +56,7 @@ void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level,
 void LoggerRegistry::remove_logger(const std::string &name)
 {
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-    if(_loggers.find(name) != _loggers.end())
+    if (_loggers.find(name) != _loggers.end())
     {
         _loggers.erase(name);
     }
@@ -69,9 +71,9 @@ std::shared_ptr<Logger> LoggerRegistry::logger(const std::string &name)
 void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
 {
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-    for(const auto &r : _reserved_loggers)
+    for (const auto &r : _reserved_loggers)
     {
-        if(_loggers.find(r) == _loggers.end())
+        if (_loggers.find(r) == _loggers.end())
         {
             _loggers[r] = std::make_shared<Logger>(r, log_level, printers);
         }
diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp
index adae8a2bf0..a467cb3320 100644
--- a/src/core/utils/misc/MMappedFile.cpp
+++ b/src/core/utils/misc/MMappedFile.cpp
@@ -27,12 +27,11 @@
 #include
 #include
-#include
-
 #include
 #include
 #include
 #include
+#include
 #include
 
 namespace arm_compute
@@ -53,7 +52,7 @@ std::pair<size_t, bool> get_file_size(const std::string &filename)
 {
     struct stat st; // NOLINT
     memset(&st, 0, sizeof(struct stat));
-    if(stat(filename.c_str(), &st) == 0)
+    if (stat(filename.c_str(), &st) == 0)
     {
         return std::make_pair(st.st_size, true);
     }
@@ -73,8 +72,7 @@ size_t get_page_size()
 }
 } // namespace
 
-MMappedFile::MMappedFile()
-    : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
+MMappedFile::MMappedFile() : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
 {
 }
 
@@ -92,14 +90,14 @@ MMappedFile::~MMappedFile()
 bool MMappedFile::map(const std::string &filename,
size_t size, size_t offset) // Extract file descriptor int fd = fileno(_fp); bool status = fd >= 0; - if(status) + if (status) { // Get file size std::tie(_file_size, status) = get_file_size(_filename); - if(status) + if (status) { // Map all file from offset if map size is 0 _map_size = (size == 0) ? _file_size : size; _map_offset = offset; // Check offset mapping - if((_map_offset > _file_size) || (_map_offset % get_page_size() != 0)) + if ((_map_offset > _file_size) || (_map_offset % get_page_size() != 0)) { status = false; } else { // Truncate to file size - if(_map_offset + _map_size > _file_size) + if (_map_offset + _map_size > _file_size) { _map_size = _file_size - _map_offset; } @@ -137,7 +135,7 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) } } - if(!status) + if (!status) { fclose(_fp); } @@ -148,14 +146,14 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) void MMappedFile::release() { // Unmap file - if(_data != nullptr) + if (_data != nullptr) { ::munmap(_data, _file_size); _data = nullptr; } // Close file - if(_fp != nullptr) + if (_fp != nullptr) { fclose(_fp); _fp = nullptr; diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index 086d63b968..f66d3e7064 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/utils/quantization/AsymmHelpers.h" #include "support/ToolchainSupport.h" @@ -40,7 +42,7 @@ constexpr float epsilon = 0.00001f; Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon) { - if(multiplier >= 1.f) + if (multiplier >= 1.f) { Status status = calculate_quantized_multiplier_greater_than_one(multiplier, quant_multiplier, shift); *shift *= -1; @@ -69,13 +71,13 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, *right_shift = -1 * shift_exp; auto q_fixed = static_cast(support::cpp11::round(q * fixed_point_one_Q0)); ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0); - if(q_fixed == fixed_point_one_Q0) + if (q_fixed == fixed_point_one_Q0) { q_fixed /= 2; --*right_shift; } - if(ignore_epsilon && *right_shift > 31) + if (ignore_epsilon && *right_shift > 31) { *right_shift = 0; q_fixed = 0; @@ -88,9 +90,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, return Status{}; } -Status calculate_quantized_multiplier_greater_than_one(float multiplier, - int32_t *quantized_multiplier, - int32_t *left_shift) +Status +calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift) { ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr); @@ -101,7 +102,7 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, *left_shift = shift_exp; auto q_fixed = static_cast(support::cpp11::round(q * fixed_point_one_Q0)); ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0); - if(q_fixed == fixed_point_one_Q0) + if (q_fixed == fixed_point_one_Q0) { q_fixed /= 2; ++*left_shift; @@ -113,9 +114,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, return Status{}; } -arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo 
&iq_info, - const QuantizationInfo &wq_info, - const QuantizationInfo &oq_info, +arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, + const QuantizationInfo &wq_info, + const QuantizationInfo &oq_info, GEMMLowpOutputStageInfo &stage_info) { ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty()); @@ -133,7 +134,7 @@ arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_i const float i_scale = iq_info.scale().at(0); const float o_scale = oq_info.scale().at(0); - for(unsigned int i = 0; i < size; ++i) + for (unsigned int i = 0; i < size; ++i) { const float multiplier = i_scale * w_scales[i] / o_scale; int32_t quant_multiplier = 0; @@ -154,7 +155,7 @@ std::pair get_min_max_values_from_quantized_data_type(DataType data_ty { int min_quant_val = 0; int max_quant_val = 0; - switch(data_type) + switch (data_type) { case DataType::QASYMM8: min_quant_val = std::numeric_limits::min(); @@ -179,7 +180,9 @@ std::pair get_min_max_values_from_quantized_data_type(DataType data_ty return std::make_pair(min_quant_val, max_quant_val); } -std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type) +std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, + const ActivationLayerInfo &act_info, + DataType data_type) { ARM_COMPUTE_ERROR_ON(data_type != DataType::QASYMM8 && data_type != DataType::QASYMM8_SIGNED); @@ -190,20 +193,23 @@ std::tuple get_quantized_asymmetric_output_min_max(const Quant const UniformQuantizationInfo q_unif = q_info.uniform(); - if(act_info.enabled()) + if (act_info.enabled()) { - switch(act_info.activation()) + switch (act_info.activation()) { case ActivationLayerInfo::ActivationFunction::RELU: type_min = q_unif.offset; break; case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: type_min = q_unif.offset; - type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info); + type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) + : quantize_qasymm8_signed(act_info.a(), q_info); break; case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) : quantize_qasymm8_signed(act_info.b(), q_info); - type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info); + type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) + : quantize_qasymm8_signed(act_info.b(), q_info); + type_max = (data_type == DataType::QASYMM8) ? 
quantize_qasymm8(act_info.a(), q_info) + : quantize_qasymm8_signed(act_info.a(), q_info); break; default: ARM_COMPUTE_ERROR("Activation function not supported."); @@ -226,7 +232,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const unsigned int num_filters = wq_info.scale().size(); - for(unsigned int i = 0; i < num_filters; ++i) + for (unsigned int i = 0; i < num_filters; ++i) { int32_t output_multiplier = 0; int32_t output_shift = 0; @@ -267,11 +273,11 @@ int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t sh int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v) { - if(exponent == 0) + if (exponent == 0) { return v; } - else if(exponent < 0) + else if (exponent < 0) { return rounding_divide_by_pow2(v, -exponent); } @@ -291,11 +297,14 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v) } } -void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift) +void get_invsqrt_quantized_multiplier_exp(int32_t input, + int32_t reverse_shift, + int32_t &output_inv_sqrt, + int32_t &output_shift) { ARM_COMPUTE_ERROR_ON(input < 0); - if(input <= 1) + if (input <= 1) { // dealing the inputs (0 and 1) separately to avoid overflow output_inv_sqrt = std::numeric_limits::max(); @@ -305,7 +314,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // prepare input for fixed point operation and compute shift value output_shift = 11; - while(input >= (1 << 29)) + while (input >= (1 << 29)) { input /= 4; ++output_shift; @@ -334,9 +343,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // multiplication of two fixed point numbers, defined for readability auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType b) -> FixedPointRawType - { - return saturating_rounding_doubling_highmul(a, b); - }; + { return saturating_rounding_doubling_highmul(a, b); }; // rescaling of fixed point to have dst_bit integer bits, defined for readability auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType @@ -347,17 +354,18 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // 5 iterations of Newton-Raphson method for inverse square root - 1.5 * x_n = input/2 * (x_n)^3 constexpr int32_t num_iteration = 5; - for(int32_t i = 0; i < num_iteration; ++i) + for (int32_t i = 0; i < num_iteration; ++i) { const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position); - x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position); + x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), + 6, fixedpoint_position); } // fixed point representation of sqrt(1/2) const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250; x = fixed_point_mul(fixedpoint_half_sqrt_2, x); output_inv_sqrt = x; - if(output_shift < 0) + if (output_shift < 0) { output_inv_sqrt <<= -output_shift; output_shift = 0; @@ -365,5 +373,5 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // convert right shift to left shift output_shift *= reverse_shift; } -} // quantization -} // arm_compute +} // namespace quantization +} // namespace arm_compute diff --git a/src/core/utils/quantization/AsymmHelpers.h b/src/core/utils/quantization/AsymmHelpers.h index 
f9701095cb..5dc607ce58 100644 --- a/src/core/utils/quantization/AsymmHelpers.h +++ b/src/core/utils/quantization/AsymmHelpers.h @@ -29,7 +29,8 @@ namespace arm_compute { -namespace quantization { +namespace quantization +{ /** Get minimum and maximum output of the activation function after quantization. * @@ -41,7 +42,9 @@ namespace quantization { * * @return The minimum and maximum output of the activation function after quantization. */ -std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type); +std::tuple get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, + const ActivationLayerInfo &act_info, + DataType data_type); } // namespace quantization } // namespace arm_compute diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp index 7c14891ef8..b745af8229 100644 --- a/src/cpu/CpuContext.cpp +++ b/src/cpu/CpuContext.cpp @@ -24,6 +24,7 @@ #include "src/cpu/CpuContext.h" #include "arm_compute/core/CPP/CPPTypes.h" + #include "src/cpu/CpuQueue.h" #include "src/cpu/CpuTensor.h" @@ -32,7 +33,7 @@ #include #if defined(_WIN64) -#define posix_memalign _aligned_realloc +#define posix_memalign _aligned_realloc #define posix_memalign_free _aligned_free #endif // defined(_WIN64) #endif // !defined(__APPLE__) && !defined(__OpenBSD__) @@ -66,7 +67,7 @@ void *default_aligned_allocate(void *user_data, size_t size, size_t alignment) size_t real_size = (rem) ? (size + alignment - rem) : size; ptr = memalign(alignment, real_size); #else /* defined(BARE_METAL) */ - if(posix_memalign(&ptr, alignment, size) != 0) + if (posix_memalign(&ptr, alignment, size) != 0) { // posix_memalign returns non-zero on failures, the return values will be // - EINVAL: wrong alignment @@ -81,17 +82,13 @@ void default_aligned_free(void *user_data, void *ptr) ARM_COMPUTE_UNUSED(user_data); free(ptr); } -static AclAllocator default_allocator = { &default_allocate, - &default_free, - &default_aligned_allocate, - &default_aligned_free, - nullptr - }; +static AclAllocator default_allocator = {&default_allocate, &default_free, &default_aligned_allocate, + &default_aligned_free, nullptr}; AllocatorWrapper populate_allocator(AclAllocator *external_allocator) { bool is_valid = (external_allocator != nullptr); - if(is_valid) + if (is_valid) { is_valid = is_valid && (external_allocator->alloc != nullptr); is_valid = is_valid && (external_allocator->free != nullptr); @@ -123,14 +120,13 @@ cpuinfo::CpuIsaInfo populate_capabilities_flags(AclTargetCapabilities external_c return isa_caps; } -CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, - int32_t max_threads) +CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, int32_t max_threads) { CpuCapabilities caps; // Populate capabilities with system information caps.cpu_info = cpuinfo::CpuInfo::build(); - if(external_caps != AclCpuCapabilitiesAuto) + if (external_caps != AclCpuCapabilitiesAuto) { cpuinfo::CpuIsaInfo isa = populate_capabilities_flags(external_caps); auto cpus = caps.cpu_info.cpus(); @@ -151,11 +147,9 @@ CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, } // namespace CpuContext::CpuContext(const AclContextOptions *options) - : IContext(Target::Cpu), - _allocator(default_allocator), - _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1)) + : IContext(Target::Cpu), _allocator(default_allocator), _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1)) { - if(options != nullptr) + if (options != nullptr) { 
_allocator = populate_allocator(options->allocator); _caps = populate_capabilities(options->capabilities, options->max_compute_units); @@ -175,7 +169,7 @@ AllocatorWrapper &CpuContext::allocator() ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool allocate) { CpuTensor *tensor = new CpuTensor(this, desc); - if(tensor != nullptr && allocate) + if (tensor != nullptr && allocate) { tensor->allocate(); } diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h index da241ed097..0c8ae49f49 100644 --- a/src/cpu/CpuContext.h +++ b/src/cpu/CpuContext.h @@ -25,8 +25,8 @@ #define SRC_CPU_CPUCONTEXT_H #include "src/common/AllocatorWrapper.h" -#include "src/common/IContext.h" #include "src/common/cpuinfo/CpuInfo.h" +#include "src/common/IContext.h" namespace arm_compute { @@ -36,7 +36,7 @@ namespace cpu struct CpuCapabilities { cpuinfo::CpuInfo cpu_info{}; - int32_t max_threads{ -1 }; + int32_t max_threads{-1}; }; /** CPU context implementation class */ @@ -60,9 +60,9 @@ public: AllocatorWrapper &allocator(); // Inherrited methods overridden - ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; - IQueue *create_queue(const AclQueueOptions *options) override; - std::tuple create_activation(const AclTensorDescriptor &src, + ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; + std::tuple create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) override; @@ -74,4 +74,4 @@ private: } // namespace cpu } // namespace arm_compute -#endif /* SRC_CPU_CPUCONTEXT_H */ \ No newline at end of file +#endif /* SRC_CPU_CPUCONTEXT_H */ diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp index 0f0097b3f4..be781d6794 100644 --- a/src/cpu/CpuQueue.cpp +++ b/src/cpu/CpuQueue.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace cpu { -CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) - : IQueue(ctx) +CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx) { ARM_COMPUTE_UNUSED(options); } diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h index 871a36c85b..b6a2be0e23 100644 --- a/src/cpu/CpuQueue.h +++ b/src/cpu/CpuQueue.h @@ -24,10 +24,10 @@ #ifndef SRC_CPU_CPUQUEUE_H #define SRC_CPU_CPUQUEUE_H -#include "src/common/IQueue.h" - #include "arm_compute/runtime/IScheduler.h" +#include "src/common/IQueue.h" + namespace arm_compute { namespace cpu diff --git a/src/cpu/CpuTensor.cpp b/src/cpu/CpuTensor.cpp index 6dd6d9c31b..59082b5350 100644 --- a/src/cpu/CpuTensor.cpp +++ b/src/cpu/CpuTensor.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace cpu { -CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) - : ITensorV2(ctx), _legacy_tensor() +CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor() { ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::Cpu)); _legacy_tensor = std::make_unique(); @@ -41,7 +40,7 @@ void *CpuTensor::map() { ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); - if(_legacy_tensor == nullptr) + if (_legacy_tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[CpuTensor:map]: Backing tensor does not exist!"); return nullptr; diff --git a/src/cpu/CpuTensor.h b/src/cpu/CpuTensor.h index b078774c99..89931e1f94 100644 --- a/src/cpu/CpuTensor.h +++ b/src/cpu/CpuTensor.h @@ -24,10 +24,10 @@ #ifndef SRC_CPU_CPUTENSOR_H #define SRC_CPU_CPUTENSOR_H -#include 
"src/common/ITensorV2.h" - #include "arm_compute/runtime/Tensor.h" +#include "src/common/ITensorV2.h" + namespace arm_compute { namespace cpu @@ -52,7 +52,7 @@ public: void *map() override; StatusCode unmap() override; arm_compute::ITensor *tensor() const override; - StatusCode import(void *handle, ImportMemoryType type) override; + StatusCode import(void *handle, ImportMemoryType type) override; private: std::unique_ptr _legacy_tensor; @@ -60,4 +60,4 @@ private: } // namespace cpu } // namespace arm_compute -#endif /* SRC_CPU_CPUTENSOR_H */ \ No newline at end of file +#endif /* SRC_CPU_CPUTENSOR_H */ diff --git a/src/cpu/CpuTypes.h b/src/cpu/CpuTypes.h index 0f7b9b6552..8726bc470a 100644 --- a/src/cpu/CpuTypes.h +++ b/src/cpu/CpuTypes.h @@ -31,6 +31,6 @@ namespace arm_compute typedef __fp16 float16_t; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC typedef float float32_t; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CPUTYPES */ diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h index 8f4106240d..bcd0cb2c70 100644 --- a/src/cpu/ICpuKernel.h +++ b/src/cpu/ICpuKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_ICPUKERNEL_H #include "arm_compute/core/CPP/ICPPKernel.h" + #include "src/cpu/kernels/CpuKernelSelectionTypes.h" namespace arm_compute @@ -34,7 +35,7 @@ namespace cpu enum class KernelSelectionType { Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */ - Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ + Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ }; template @@ -50,13 +51,15 @@ public: */ template - static const auto *get_implementation(const SelectorType &selector, KernelSelectionType selection_type = KernelSelectionType::Supported) + static const auto *get_implementation(const SelectorType &selector, + KernelSelectionType selection_type = KernelSelectionType::Supported) { - using kernel_type = typename std::remove_reference::type::value_type; + using kernel_type = + typename std::remove_reference::type::value_type; - for(const auto &uk : Derived::get_available_kernels()) + for (const auto &uk : Derived::get_available_kernels()) { - if(uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr)) + if (uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr)) { return &uk; } diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index f4bd4e6cad..50bf672d3c 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -26,11 +26,11 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/activation/list.h" #include @@ -43,126 +43,126 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { #ifdef ARM_COMPUTE_ENABLE_SVE - { - "sve2_q8_activation_lut", - [](const ActivationDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.cpumodel == CPUModel::A510 && data.isa.sve2; }, 
- REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut) - }, + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && + data.cpumodel == CPUModel::A510 && data.isa.sve2; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, #endif // ARM_COMPUTE_ENABLE_SVE #ifdef __aarch64__ - { - // Neon LUT implementantion takes precedence - "neon_q8_activation_lut", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut) - }, + {// Neon LUT implementantion takes precedence + "neon_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, #endif // __aarch64__ - { - "sve2_qu8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation) - }, - { - "sve2_qs8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation) - }, - { - "sve2_qs16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation) - }, - { - "sve_fp16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation) - }, - { - "sve_fp32_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation) - }, - { - "neon_fp16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation) - }, - { - "neon_fp32_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation) - }, - { - "neon_qu8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation) - }, - { - "neon_qs8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation) - }, - { - "neon_qs16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation) - }, + {"sve2_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve2 && + data.f != 
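// Illustrative sketch (hypothetical names, not the library's types): the
// registry-plus-selector pattern that get_implementation() and the
// available_kernels tables in this file follow, reduced to the "Supported"
// behaviour: walk the table in preference order and return the first entry
// whose predicate matches and whose function pointer was compiled in.
#include <functional>
#include <string>
#include <vector>

struct SelectorDataSketch // stands in for the *ISASelectorData structs
{
    bool is_quantized;
    bool has_sve2;
};

struct MicroKernelSketch
{
    std::string                                     name;
    std::function<bool(const SelectorDataSketch &)> is_selected;
    void (*ukernel)(const void *src, void *dst, int len); // nullptr when excluded from the build
};

inline const MicroKernelSketch *pick_kernel(const std::vector<MicroKernelSketch> &table,
                                            const SelectorDataSketch             &sel)
{
    for (const auto &uk : table)
    {
        if (uk.is_selected(sel) && uk.ukernel != nullptr)
        {
            return &uk;
        }
    }
    return nullptr; // callers treat this as an error, as the validate routines here do
}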
ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, + {"sve2_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, + {"sve2_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QSYMM16 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"sve_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, + {"sve_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, + {"neon_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, + {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, + {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, + {"neon_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, + {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, }; /* Supported activation in the 8-bit integer domain */ -static const std::array qasymm8_activations = -{ - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LEAKY_RELU, - ActivationLayerInfo::ActivationFunction::GELU, +static const std::array qasymm8_activations = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU, }; /* Supported activation in the 16-bit integer domain */ -static const std::array qsymm16_activations = -{ - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU -}; +static const std::array qsymm16_activations 
= { + ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH, + ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::QSYMM16, DataType::F16, DataType::F32); - const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() }); + const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)), - "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized_asymmetric(data_type) && + (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == + std::end(qasymm8_activations)), + "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && + (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), + f_act) == std::end(qsymm16_activations)), "For QSYMM16 only tanh and logistic are supported"); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) - && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); - - 
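// Aside (illustrative only): why the fixed output QuantizationInfo values in
// the surrounding checks are the natural choices. With real = scale * (q - offset):
//   LOGISTIC output lies in [0, 1)  -> scale 1/256,   offset 0 (QASYMM8) or -128 (QASYMM8_SIGNED)
//   TANH     output lies in [-1, 1) -> scale 1/128,   offset 128 (QASYMM8) or 0 (QASYMM8_SIGNED)
//   QSYMM16:                           scale 1/32768, offset 0 covers [-1, 1) for both functions
// A compile-time spot check of the two QASYMM8 cases (hypothetical helper, not a library function):
constexpr float dequant_example(int q, float scale, int offset)
{
    return scale * static_cast<float>(q - offset);
}
static_assert(dequant_example(255, 1.f / 256.f, 0) < 1.0f, "top LOGISTIC code stays below 1");
static_assert(dequant_example(0, 1.f / 128.f, 128) == -1.0f, "bottom TANH code maps exactly to -1");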
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -176,7 +176,7 @@ std::pair validate_and_configure_window(const ITensorInfo *src, // Configure kernel window Window win = calculate_max_window(*src, Steps()); - if(dst != nullptr) + if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -185,14 +185,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src, return std::make_pair(Status{}, win); } #ifdef __aarch64__ -void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type, - const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out, - ActivationLayerInfo::LookupTable256 &lut, float a, float b) +void init_lut(ActivationLayerInfo::ActivationFunction act_func, + DataType data_type, + const UniformQuantizationInfo &qi_in, + const UniformQuantizationInfo &qi_out, + ActivationLayerInfo::LookupTable256 &lut, + float a, + float b) { - for(size_t i = 0; i < lut.size(); ++i) + for (size_t i = 0; i < lut.size(); ++i) { - float tmp_f = (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in); - switch(act_func) + float tmp_f = + (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in); + switch (act_func) { case ActivationLayerInfo::ActivationFunction::HARD_SWISH: tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); @@ -246,7 +251,8 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_ty tmp_f = 0; break; } - lut[i] = (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out); + lut[i] = + (data_type == DataType::QASYMM8) ? 
quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out); } } #endif // __aarch64__ @@ -258,8 +264,9 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); - const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() }); - if(dst != nullptr) + const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); + if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -271,11 +278,12 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac _name = std::string("CpuActivationKernel").append("/").append(uk->name); #ifdef __aarch64__ - if(src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) + if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) { ActivationLayerInfo::LookupTable256 tmp_lut; - init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), - tmp_lut, activation_info.a(), activation_info.b()); + init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), + (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut, + activation_info.a(), activation_info.b()); activation_info.setLookupTable256(tmp_lut); } #endif // __aarch64__ @@ -288,11 +296,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac ICPPKernel::configure(win); } -Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status +CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); return Status{}; } @@ -302,7 +312,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. 
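As a standalone illustration of the lookup-table approach used by init_lut() above: each of the 256 possible 8-bit codes is dequantised, pushed through the activation in float and re-quantised once at configure time, so the per-element cost at run time is a single table lookup. The names and quantisation helpers below are assumptions for the sketch, not Compute Library API.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>

struct QParamsSketch // hypothetical; the kernel takes these from UniformQuantizationInfo
{
    float   scale;
    int32_t offset;
};

inline std::array<uint8_t, 256> make_logistic_lut(QParamsSketch in, QParamsSketch out)
{
    std::array<uint8_t, 256> lut{};
    for (size_t i = 0; i < lut.size(); ++i)
    {
        // dequantise code i, apply the activation in float, re-quantise the result
        const float   x = in.scale * (static_cast<float>(i) - static_cast<float>(in.offset));
        const float   y = 1.f / (1.f + std::exp(-x)); // logistic
        const int32_t q = static_cast<int32_t>(std::lround(y / out.scale)) + out.offset;
        lut[i]          = static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // saturate to 8 bits
    }
    return lut;
}
// At run time the quantised activation then reduces to: dst[i] = lut[src[i]];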
@@ -314,7 +324,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { // Early exit on disabled activation - if(!_act_info.enabled()) + if (!_act_info.enabled()) { return; } diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index 804407653f..4bad9fb3e8 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -38,7 +39,8 @@ namespace kernels class CpuActivationKernel : public ICpuKernel { private: - using ActivationKernelPtr = std::add_pointer::type; + using ActivationKernelPtr = + std::add_pointer::type; public: CpuActivationKernel() = default; @@ -71,7 +73,7 @@ public: size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. @@ -94,8 +96,8 @@ public: private: ActivationLayerInfo _act_info{}; - ActivationKernelPtr _run_method{ nullptr }; - size_t _split_dimension{ Window::DimY }; + ActivationKernelPtr _run_method{nullptr}; + size_t _split_dimension{Window::DimY}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp index 2983575cb6..a990aa4715 100644 --- a/src/cpu/kernels/CpuAddKernel.cpp +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -26,19 +26,21 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/add/list.h" + #include #if defined(ENABLE_FP32_KERNELS) namespace { - static constexpr size_t default_mws_N1_fp32_neon = 24536; - static constexpr size_t default_mws_V1_fp32_neon = 40510; -} +static constexpr size_t default_mws_N1_fp32_neon = 24536; +static constexpr size_t default_mws_V1_fp32_neon = 40510; +} // namespace #endif /* ENABLE_FP32_KERNELS */ namespace arm_compute @@ -49,152 +51,82 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_qu8_add_fixedpoint", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; - }, - REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint) - }, - { - "neon_qs8_add_fixedpoint", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; - }, - REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint) - }, - { - "sve2_qu8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8) && data.isa.sve2; - }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2) - }, - { - "sve2_qs8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && 
data.isa.sve2; - }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2) - }, - { - "sve2_qs16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QSYMM16) && data.isa.sve2; - }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2) - }, - { - "sve_fp32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F32) && data.isa.sve; - }, - REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve) - }, - { - "sve_fp16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; - }, - REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve) - }, - { - "sve_u8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::U8) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve) - }, - { - "sve_s16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::S16) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve) - }, - { - "sve_s32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::S32) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve) - }, - { - "neon_fp32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon) - }, - { - "neon_fp16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon) - }, - { - "neon_u8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon) - }, - { - "neon_s16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon) - }, - { - "neon_s32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon) - }, - { - "neon_qu8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon) - }, - { - "neon_qs8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon) - }, - { - "neon_qs16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) - } -}; - -Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +static const std::vector available_kernels = { + {"neon_qu8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint)}, + {"neon_qs8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint)}, + {"sve2_qu8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, 
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)}, + {"sve2_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)}, + {"sve2_qs16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)}, + {"sve_fp32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)}, + {"sve_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)}, + {"sve_u8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)}, + {"sve_s16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)}, + {"sve_s32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)}, + {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)}, + {"neon_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)}, + {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)}, + {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)}, + {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)}, + {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)}, + {"neon_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)}, + {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}}; + +Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); const TensorShape out_shape = 
TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) - || (src1.data_type() != dst.data_type())), - "Broadcasting across width is supported on configurations where all tensors have the same data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (src0.tensor_shape().x() != src1.tensor_shape().x()) && + ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) || + (src1.data_type() != dst.data_type())), + "Broadcasting across width is supported on configurations where all tensors have the same data type"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), @@ -202,8 +134,8 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons } const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0.data_type(), - CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuAddKernel::get_implementation( + CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -215,9 +147,9 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0->data_type(), - CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); + const auto uk = CpuAddKernel::get_implementation( + CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -237,7 +169,8 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ICpuKernel::configure(win); } -Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +Status +CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); @@ -277,14 +210,14 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &add_fp32_neon) + if (this->_run_method == &add_fp32_neon) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } @@ -294,7 +227,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was 
re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -307,7 +240,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return std::max(static_cast(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h index 9921feabe2..4adba8bb16 100644 --- a/src/cpu/kernels/CpuAddKernel.h +++ b/src/cpu/kernels/CpuAddKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuAddKernel : public ICpuKernel { private: - using AddKernelPtr = std::add_pointer::type; + using AddKernelPtr = std::add_pointer::type; public: struct AddKernel @@ -74,10 +75,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Return minimum workload size of the relevant kernel @@ -98,9 +100,9 @@ public: private: ConvertPolicy _policy{}; - AddKernelPtr _run_method{ nullptr }; + AddKernelPtr _run_method{nullptr}; std::string _name{}; - size_t _split_dimension{ Window::DimY }; + size_t _split_dimension{Window::DimY}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp index b84bdd54e9..6a632e8702 100644 --- a/src/cpu/kernels/CpuAddMulAddKernel.cpp +++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp @@ -27,8 +27,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/addmuladd/list.h" @@ -41,36 +41,28 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { #ifdef __aarch64__ - { - "neon_fp32_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon) - }, - { - "neon_fp16_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon) - }, - { - "neon_qasymm8_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon) - }, - { - "neon_qasymm8_signed_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon) - } + {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)}, + {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); }, + 
REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)}, + {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)}, + {"neon_qasymm8_signed_add_mul_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)} #endif // __aarch64__ }; -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); @@ -78,16 +70,16 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, using ActFunction = ActivationLayerInfo::ActivationFunction; const ActFunction act_func = act_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY), - "Only RELU Family activations, or no activation, is supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && + act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY), + "Only RELU Family activations, or no activation, is supported"); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - if(is_data_type_quantized(input1->data_type())) + if (is_data_type_quantized(input1->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32); @@ -101,39 +93,47 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add); ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients should be 1D array"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], "First dimensions of inputs and batchNorm coefs should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], + "First dimensions of inputs and batchNorm coefs should match"); // Validate in case we have add layer's output (intermediate) initialized - if(add_output != nullptr && add_output->total_size() > 0) + if (add_output != nullptr && add_output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output); } // Validate in case final output has been initialized - if(final_output->total_size() > 0) + if (final_output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, 
final_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output); } - const auto uk = CpuAddMulAddKernel::get_implementation(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuAddMulAddKernel::get_implementation( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; } } // namespace -void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAddMulAddKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2); ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); - const auto uk = CpuAddMulAddKernel::get_implementation(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuAddMulAddKernel::get_implementation( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); @@ -146,7 +146,7 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo set_shape_if_empty(*final_output, input1->tensor_shape()); set_data_type_if_unknown(*final_output, input1->data_type()); - if(add_output != nullptr) + if (add_output != nullptr) { set_shape_if_empty(*add_output, input1->tensor_shape()); set_data_type_if_unknown(*add_output, input1->data_type()); @@ -158,14 +158,19 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo ICpuKernel::configure(win); } -Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); return Status{}; } diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h index 67ce6f029a..c5e31ec291 100644 --- a/src/cpu/kernels/CpuAddMulAddKernel.h +++ b/src/cpu/kernels/CpuAddMulAddKernel.h @@ -26,6 +26,7 @@ #define SRC_CPU_KERNELS_CPUADDMULADDKERNEL #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" 
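A brief aside on the std::add_pointer aliases being reflowed in these kernel headers: the alias is simply a spelled-out function-pointer type, which the kernel stores (the _run_method members above) and later fills with the selected micro-kernel. A minimal sketch with hypothetical stand-in types:

#include <type_traits>

struct ITensorPackSketch; // stand-ins for the real ITensorPack / Window classes
struct WindowSketch;

using RunMethodPtr = std::add_pointer<void(const ITensorPackSketch &, const WindowSketch &)>::type;
// ...which names exactly the same type as the plain function-pointer spelling:
using RunMethodPtrEquivalent = void (*)(const ITensorPackSketch &, const WindowSketch &);

static_assert(std::is_same<RunMethodPtr, RunMethodPtrEquivalent>::value,
              "std::add_pointer<Sig>::type is the function-pointer type Sig*");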
@@ -39,8 +40,15 @@ namespace kernels class CpuAddMulAddKernel : public ICpuKernel { private: - using AddMulAddKernelPtr = - std::add_pointer::type; + using AddMulAddKernelPtr = std::add_pointer::type; public: struct AddMulAddKernel @@ -57,23 +65,31 @@ public: * Similar to @ref NEAddMulAdd::configure() * */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuAddMulAddKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector &get_available_kernels(); @@ -81,7 +97,7 @@ public: private: ConvertPolicy _policy{}; ActivationLayerInfo _act_info{}; - AddMulAddKernelPtr _run_method{ nullptr }; + AddMulAddKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 764a1ec71c..05c7742b03 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -28,16 +28,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/SaturateCast.h" - #include "src/cpu/kernels/cast/list.h" +#include "support/SaturateCast.h" namespace arm_compute { @@ -47,38 +47,30 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_qs8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast) - }, - { - "neon_qu8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_u8_cast", - [](const CastDataTypeISASelectorData 
& data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast) - }, - { - "neon_fp32_to_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast) - }, - { - "neon_s32_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast) - }, +static const std::vector available_kernels = { + {"neon_qs8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)}, + {"neon_qu8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_u8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_fp16_cast", + [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)}, + {"neon_fp32_to_fp16_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)}, + {"neon_s32_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)}, }; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) @@ -88,57 +80,67 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); #ifdef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::U32, DataType::S32, DataType::F32, DataType::S64); #else // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); #endif // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 && + dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && + (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != 
DataType::QASYMM8 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::S64), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::S64), "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64"); #ifdef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, @@ -149,7 +151,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver #endif // __aarch64__ // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -193,15 +195,8 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) template <> inline void internal_neon_convert(const int32_t *src_ptr, int64_t *dst_ptr) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr), - vld1q_s32(src_ptr + 4), - vld1q_s32(src_ptr + 8), - vld1q_s32(src_ptr + 12) - } - }; + const int32x4x4_t texels = { + {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}}; vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0]))); vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0]))); vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1]))); @@ -215,33 +210,14 @@ inline void internal_neon_convert(const int32_t *src_ptr, int6 template <> inline void internal_neon_convert(const int64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 14)) - } - }; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float64x2x4_t texels0 = 
{{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}}; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); vst1q_f32(dst_ptr + 8, texels.val[2]); @@ -251,34 +227,15 @@ inline void internal_neon_convert(const int64_t *src_ptr, float template <> inline void internal_neon_convert(const uint64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 14)) - } - }; + const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}}; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); @@ -287,23 +244,26 @@ inline void internal_neon_convert(const uint64_t *src_ptr, floa } template -inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) +inline void +convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - internal_neon_convert(src_ptr + x, dst_ptr + x); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - 
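// Illustrative sketch of the vector-main-loop plus scalar-tail pattern that
// convert64() and the cast cases below use: process full 128-bit registers in
// steps of the vector width, then finish the leftover elements one by one.
// The function name and the fixed step of 16 are assumptions for the sketch.
#if defined(__aarch64__)
#include <arm_neon.h>
#include <cstdint>

inline void widen_s8_to_s16_sketch(const int8_t *src, int16_t *dst, int n)
{
    int           x    = 0;
    constexpr int step = 16; // one int8x16_t register per iteration
    for (; x <= n - step; x += step)
    {
        const int8x16_t v = vld1q_s8(src + x);
        vst1q_s16(dst + x, vmovl_s8(vget_low_s8(v)));      // lanes 0..7 widened to int16
        vst1q_s16(dst + x + 8, vmovl_s8(vget_high_s8(v))); // lanes 8..15 widened to int16
    }
    for (; x < n; ++x) // scalar tail for the remaining 0..15 elements
    {
        dst[x] = static_cast<int16_t>(src[x]);
    }
}
#endif // __aarch64__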
window_step_x); x += window_step_x) + { + internal_neon_convert(src_ptr + x, dst_ptr + x); + } + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } } // namespace #endif // __aarch64__ @@ -325,21 +285,22 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /*ukernel runs only when using fp16, so we validate it isn't a nullptr only before using it */ - const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuCastKernel::get_implementation( + CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()}); - switch(_src->info()->data_type()) + switch (_src->info()->data_type()) { #ifdef __aarch64__ case DataType::U64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -353,7 +314,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -369,111 +330,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8_SIGNED: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion QASYMM8_SIGNED -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion QASYMM8_SIGNED -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; - const int16x8x2_t 
texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion QASYMM8_SIGNED -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -492,111 +444,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8: case DataType::U8: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion U8 -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= 
(window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const 
int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -609,35 +552,32 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { /* Up-conversion U8 -> U16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const uint16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_u8(vget_low_u8(texels_u8)), - vmovl_u8(vget_high_u8(texels_u8)) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_u16(dst_ptr + x, texels.val[0]); - vst1q_u16(dst_ptr + x + 8, texels.val[1]); - } + const uint16x8x2_t texels = { + {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u16(dst_ptr + x, texels.val[0]); + vst1q_u16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -647,177 +587,154 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S16: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion S16 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + 
execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U8: { /* Down-conversion S16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = 
reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), - vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); - } + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::S32: { /* Up-conversion S16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int32x4x4_t texels_s32 = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s16(vget_low_s16(texels.val[0])), - vmovl_s16(vget_high_s16(texels.val[0])), - vmovl_s16(vget_low_s16(texels.val[1])), - vmovl_s16(vget_high_s16(texels.val[1])) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s32(dst_ptr + x, texels_s32.val[0]); - vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); - vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); - vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); - } + const int32x4x4_t texels_s32 = { + {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])), + vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, texels_s32.val[0]); + vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); + vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); + vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -828,104 +745,92 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::U8: { /* Down-conversion U16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x 
= window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U32: { /* Up-conversion U16 -> U32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); - vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - }, - src, dst); + vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 8, 
vmovl_u16(vget_low_u16(texels.val[1]))); + vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -941,7 +846,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr break; } case DataType::F32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F16: { @@ -953,105 +858,110 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::S32: { /* Conversion F32 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; + }}; - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); - } + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F32 -> U8 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); - } + }}; + + vst1_u8(dst_ptr + x, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), + vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), + vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - 
*(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion F32 -> QASYMM8_SIGNED */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + }}; + + vst1_s8(dst_ptr + x, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), + vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_s8(dst_ptr + x + 8, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), + vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } @@ -1060,7 +970,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } break; case DataType::S32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { #if __aarch64__ case DataType::S64: @@ -1079,104 +989,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::F32: { /* Conversion S32 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; + }}; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); - } + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - 
*(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion S32 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3])))); - } + }}; + vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), + vqmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), + vqmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3])))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), + vmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), + vmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } @@ -1184,68 +1092,66 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U8: { /* Down-conversion S32 -> U8 */ - if(ConvertPolicy::SATURATE == 
_policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); - } + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), + vqmovun_s32(texels.val[1])))); + vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), + vqmovun_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_u8(dst_ptr + x, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } break; } diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h index a7e6417ff2..ddbfe1f034 100644 --- a/src/cpu/kernels/CpuCastKernel.h +++ b/src/cpu/kernels/CpuCastKernel.h @@ -40,7 +40,8 
@@ namespace kernels class CpuCastKernel : public ICpuKernel { private: - using CastKernelPtr = std::add_pointer::type; + using CastKernelPtr = + std::add_pointer::type; public: CpuCastKernel() = default; @@ -76,7 +77,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct CastKernel @@ -89,7 +90,7 @@ public: static const std::vector &get_available_kernels(); private: - ConvertPolicy _policy{ ConvertPolicy::SATURATE }; + ConvertPolicy _policy{ConvertPolicy::SATURATE}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp index bf5a44d78b..a52a1f58ea 100644 --- a/src/cpu/kernels/CpuCol2ImKernel.cpp +++ b/src/cpu/kernels/CpuCol2ImKernel.cpp @@ -29,8 +29,9 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,9 +50,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); // Validate configured output - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_col2im_shape(*src, convolved_dims, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -106,13 +108,16 @@ void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const T Iterator in(src, window); Iterator out(dst, window_out); - execute_window_loop(window, [&](const Coordinates & id) - { - const int hidx = id.y(); - const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x; - std::memcpy(out.ptr() + idx, in.ptr(), el_size); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int hidx = id.y(); + const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + + (hidx % _convolved_dims.width) * output_stride_x; + std::memcpy(out.ptr() + idx, in.ptr(), el_size); + }, + in, out); } const char *CpuCol2ImKernel::name() const @@ -121,4 +126,4 @@ const char *CpuCol2ImKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h index deafcc14df..3e394ac914 100644 --- a/src/cpu/kernels/CpuCol2ImKernel.h +++ b/src/cpu/kernels/CpuCol2ImKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_COL2IM_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -75,7 +76,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims); // Inherited 
methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp index 29d40f0e52..8c290173e8 100644 --- a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp @@ -30,10 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -50,13 +51,14 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); // Offset dst - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + batch_offset * dst->info()->strides_in_bytes()[3]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); const int window_step_x = 16 / dst->info()->element_size(); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1)); @@ -66,66 +68,74 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c const DataType dt = src->info()->data_type(); const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = 
reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } @@ -154,7 +164,7 @@ void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int b _func = nullptr; _batch_offset = batch_offset; - switch(src->data_type()) + switch (src->data_type()) { case DataType::S8: case DataType::U8: @@ -196,9 +206,7 @@ void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _batch_offset, + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset, window); } diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h index 0de68a5d64..52ea553a7d 100644 --- a/src/cpu/kernels/CpuConcatenateBatchKernel.h +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h @@ -57,15 +57,15 @@ public: static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); private: - BatchConcatFunction 
*_func{ nullptr }; - unsigned int _batch_offset{ 0 }; + BatchConcatFunction *_func{nullptr}; + unsigned int _batch_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp index ebc5322aee..c75e1e4477 100644 --- a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp @@ -30,11 +30,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include @@ -53,13 +54,14 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); // Offset destination - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + depth_offset * dst->info()->strides_in_bytes()[2]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); const int window_step_x = 16 / dst->info()->element_size(); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1)); @@ -69,64 +71,73 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c const DataType dt = src->info()->data_type(); const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = 
reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } @@ -134,7 +145,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, c { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); @@ -154,7 +166,7 @@ void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int d _func = nullptr; _depth_offset = depth_offset; - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: _func = &depth_concat; @@ -192,9 +204,7 @@ void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _depth_offset, + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _depth_offset, window); } diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h index 5a0edb95bb..54de9aff46 100644 --- a/src/cpu/kernels/CpuConcatenateDepthKernel.h +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h @@ -65,15 +65,15 @@ public: static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); private: - DepthConcatFunction *_func{ nullptr }; - unsigned int _depth_offset{ 0 }; + DepthConcatFunction *_func{nullptr}; + unsigned int _depth_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp index 47a2b44443..b6c11d948b 100644 --- a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp @@ -30,10 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -91,13 +92,14 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind auto dst = tensors.get_tensor(TensorType::ACL_DST); // Offset destination pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + 
dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); const int window_step_x = 16; - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1)); @@ -108,64 +110,74 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind const DataType dt = src->info()->data_type(); const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr()) + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr()) + x), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - 
wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h index 74d5d0c2c3..df880c4878 100644 --- a/src/cpu/kernels/CpuConcatenateHeightKernel.h +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h @@ -58,11 +58,11 @@ public: static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _height_offset{ 0 }; + unsigned int _height_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp index f00b37a01b..f6100cccca 100644 --- a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp @@ -24,12 +24,12 @@ #include "src/cpu/kernels/CpuConcatenateWidthKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Steps.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Steps.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" namespace arm_compute { @@ -47,7 +47,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -86,13 +86,14 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo auto dst = tensors.get_tensor(TensorType::ACL_DST); // Offset output pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _width_offset * dst->info()->strides_in_bytes()[0]; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); constexpr int window_step_x = 16; - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); // Create iterators @@ -101,62 +102,73 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo const DataType dt = src->info()->data_type(); const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const 
UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr() + x)), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr() + x)), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h index 418bc51b33..560e44e35a 100644 --- a/src/cpu/kernels/CpuConcatenateWidthKernel.h +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h @@ -58,11 +58,11 @@ 
public: static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _width_offset{ 0 }; + unsigned int _width_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp index 08b39deef2..87703ec631 100644 --- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -34,8 +35,10 @@ namespace cpu { namespace kernels { -void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) +void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -43,7 +46,8 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*dst, *src->clone()); - ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; @@ -62,8 +66,10 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT ICpuKernel::configure(win); } -Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); @@ -72,7 +78,7 @@ Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, c ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -97,11 +103,15 @@ void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const W Iterator input(src, window); Iterator output(dst, window); - execute_window_loop(window, [&](const Coordinates & id) - { - memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size); - }, - input); + execute_window_loop( + window, + [&](const Coordinates &id) + { + memcpy(output.ptr() + id.x() * dst_stride_x + + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, + input.ptr(), element_size); + }, + input); } const char *CpuConvertFullyConnectedWeightsKernel::name() const diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h index 9a1393323b..2253889e69 100644 --- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h @@ -53,24 +53,32 @@ public: * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
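Reviewer note on CpuConvertFullyConnectedWeightsKernel: the memcpy in run_op scatters source row y to destination row (y % _factor1) * _factor2 + y / _factor1, i.e. it transposes a _factor1 x _factor2 index grid so that weights trained against one data layout line up with the other. A standalone sketch of that permutation is shown below; the function name and the use of int payloads are hypothetical and not part of the patch.

#include <cstddef>
#include <vector>

// Transposes the (factor1 x factor2) row grid: row y moves to (y % factor1) * factor2 + y / factor1.
// Precondition: src_rows.size() == factor1 * factor2.
std::vector<int> permute_fc_weight_rows(const std::vector<int> &src_rows, size_t factor1, size_t factor2)
{
    std::vector<int> dst_rows(src_rows.size());
    for (size_t y = 0; y < src_rows.size(); ++y)
    {
        dst_rows[(y % factor1) * factor2 + y / factor1] = src_rows[y];
    }
    return dst_rows;
}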
*/ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ - unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ + unsigned int _factor1{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ + unsigned int _factor2{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp index 1005d001ab..745b1566c2 100644 --- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); // Validate output if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); @@ -60,11 +61,11 @@ std::pair validate_and_configure_window(const ITensorInfo *src, { // Output auto inizialitation if not yet initialized { - const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; - const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; + const DataType dt = is_input_signed ? 
DataType::QASYMM8 : DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); const int offset_correction = is_input_signed ? -128 : 128; - const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); + const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); } @@ -110,27 +111,29 @@ void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Win const uint8_t mask = 128; const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const uint8_t in = *(reinterpret_cast(input_ptr + x)); - *(output_ptr + x) = in ^ mask; - } - }, - input, output); + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); } const char *CpuConvertQuantizedSignednessKernel::name() const diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h index b5eaf65487..e94d3d5ef2 100644 --- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp index 3f0f3fe422..1b693d7a3a 100644 --- a/src/cpu/kernels/CpuCopyKernel.cpp +++ b/src/cpu/kernels/CpuCopyKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,9 +49,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); // Validate destination if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - 
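Reviewer note on CpuConvertQuantizedSignednessKernel a few hunks up: the veor with a 0x80 mask works because flipping the most significant bit maps every unsigned code u in [0, 255] to the signed code u - 128 under two's complement, so run_op only relabels the 256 quantisation levels while the configure step accounts for the zero-point change through the corrected QuantizationInfo. A self-contained check of that identity, illustrative only and not part of the patch:

#include <cassert>
#include <cstdint>

int main()
{
    for (int u = 0; u < 256; ++u)
    {
        // XOR with 0x80 flips the sign bit; reinterpreting the byte as int8_t yields u - 128.
        const int8_t s = static_cast<int8_t>(static_cast<uint8_t>(u) ^ 0x80);
        assert(static_cast<int>(s) == u - 128);
    }
    return 0;
}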
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -64,7 +66,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src, return std::make_pair(Status{}, calculate_max_window(*dst)); } -std::pair validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) +std::pair +validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) { const TensorShape src_shape = src->tensor_shape(); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding); @@ -84,7 +87,7 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa _padding = padding; std::pair win_config; - if(padding.empty()) + if (padding.empty()) { win_config = validate_and_configure_window(src, dst); } @@ -97,17 +100,20 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa ICpuKernel::configure(win_config.second); } -Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding) +Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, + const arm_compute::ITensorInfo *dst, + const PaddingList &padding) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding)); - if(padding.empty()) + if (padding.empty()) { ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); } else { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); } return Status{}; @@ -122,38 +128,41 @@ void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const Thr const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_padding.empty()) + if (_padding.empty()) { - Window dst_window{ window }; - dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); + Window dst_window{window}; + dst_window.set(Window::DimX, + Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); Window out_slice = dst_window.first_slice_window_1D(); do { Iterator src_it(src, out_slice); Iterator dst_it(dst, out_slice); - execute_window_loop(out_slice, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); - }, - src_it, dst_it); - } - while(dst_window.slide_window_slice_1D(out_slice)); + execute_window_loop( + out_slice, + [&](const Coordinates &) + { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); }, + src_it, dst_it); + } while (dst_window.slide_window_slice_1D(out_slice)); } else { - Window src_window{ window }; - src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); + Window src_window{window}; + src_window.set(Window::DimX, + Window::Dimension(0, window.x().end() - _padding[0].first, 
src->info()->dimension(0))); Iterator src_it(src, src_window); Iterator dst_it(dst, window); const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size(); - execute_window_loop(window, [&](const Coordinates &) - { - auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); - std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); - }, - src_it, dst_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); + std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h index c9ef8eba76..a05053f07e 100644 --- a/src/cpu/kernels/CpuCopyKernel.h +++ b/src/cpu/kernels/CpuCopyKernel.h @@ -55,7 +55,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp index d6c56d2012..82e3a5ce00 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/traits.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" #include "src/cpu/kernels/depthwiseconv2d/list.h" namespace arm_compute @@ -41,72 +42,53 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_qu8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QASYMM8); - }, - REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative) - }, - { - "neon_qs8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QASYMM8_SIGNED); - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative) - }, - { - "neon_fp16_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::F16 && data.isa.fp16); - }, - REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative) - }, - { - "neon_fp32_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative) - }, - { - "neon_qp8_qu8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); - }, - REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative) - }, - { - "neon_qp8_qs8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); - }, - 
REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative) - }, +static const std::vector available_kernels = { + {"neon_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)}, + {"neon_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)}, + {"neon_fp16_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)}, + {"neon_fp32_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)}, + {"neon_qp8_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)}, + {"neon_qp8_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > + src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > + src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); + ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || + 
(info.pad_stride_info.stride().second < 1)); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); @@ -116,12 +98,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -131,9 +113,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + const TensorShape output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -142,7 +125,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } // namespace -void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info)); @@ -151,18 +138,26 @@ void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITe _conv_info = info; const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation( - DepthwiseConv2dNativeDataTypeISASelectorData{ weights->data_type(), src->data_type(), CPUInfo::get().get_isa() }); + DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _func = uk->ukernel; const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info())); + auto_init_if_empty(*dst, src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(output_shape) + .set_quantization_info(dst->quantization_info())); Window win = calculate_max_window(*dst, Steps()); ICpuKernel::configure(win); } -Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); return Status{}; @@ -187,7 +182,8 @@ const char *CpuDepthwiseConv2dNativeKernel::name() const return "CpuDepthwiseConv2dNativeKernel"; } -const std::vector &CpuDepthwiseConv2dNativeKernel::get_available_kernels() +const std::vector & +CpuDepthwiseConv2dNativeKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h index 9fabd0b01c..7e78f52e13 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/function_info/ConvolutionInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" #include "support/AclRequires.h" @@ -44,8 +45,9 @@ namespace kernels class CpuDepthwiseConv2dNativeKernel : public ICpuKernel { private: - using DepthwiseConv2dNativeKernelPtr = - std::add_pointer::type; + using DepthwiseConv2dNativeKernelPtr = std::add_pointer:: + type; public: CpuDepthwiseConv2dNativeKernel() = default; @@ -64,17 +66,25 @@ public: * @param[in] info Depthwise convolution meta-data. 
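Reviewer note on the depthwise validate_arguments hunks above: the two dimension checks compare the dilated kernel extent against the padded input. With dilation d, a kernel of size k spans k_eff = k + (k - 1) * (d - 1) input elements, and the resulting output size under floor rounding is (in + pads - k_eff) / stride + 1; the library's shape calculator also supports ceil rounding, which this sketch ignores. The constexpr helpers below are illustrative only, not part of the patch.

// Shape arithmetic behind the dilation checks, assuming floor rounding.
constexpr int effective_kernel(int k, int dilation)
{
    return k + (k - 1) * (dilation - 1);
}

constexpr int conv_out_dim(int in, int k, int stride, int dilation, int pad_before, int pad_after)
{
    return (in + pad_before + pad_after - effective_kernel(k, dilation)) / stride + 1;
}

static_assert(effective_kernel(3, 2) == 5, "3-tap kernel at dilation 2 spans 5 input elements");
static_assert(conv_out_dim(224, 3, 1, 1, 1, 1) == 224, "same-padded 3x3, stride 1 preserves the dimension");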
* */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DepthwiseConv2dNativeKernel { @@ -89,9 +99,9 @@ private: * * @param[in] window Region on which to execute the kernel. */ - DepthwiseConv2dNativeKernelPtr _func{ nullptr }; + DepthwiseConv2dNativeKernelPtr _func{nullptr}; ConvolutionInfo _conv_info{}; - bool _has_biases{ false }; + bool _has_biases{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp index a2d24f9243..d17128b5ac 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.cpp +++ b/src/cpu/kernels/CpuDequantizeKernel.cpp @@ -28,12 +28,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include @@ -48,9 +49,11 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, + DataType::QSYMM16); - if(dst->tensor_shape().total_size() > 0) + if (dst->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); @@ -124,28 +127,30 @@ void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Win Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale, offset); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - 
store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale, offset); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto val = *(in_ptr + x); - *(out_ptr + x) = static_cast(Qasymm8QuantizationHelper::dequantize(val, qinfo)); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto val = *(in_ptr + x); + *(out_ptr + x) = static_cast(Qasymm8QuantizationHelper::dequantize(val, qinfo)); + } + }, + in, out); } template @@ -165,28 +170,30 @@ void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *o Iterator in(input, win); Iterator out(output, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale[id.z()]); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale[id.z()]); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[id.z()])); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale[id.z()])); + } + }, + in, out); } template @@ -206,37 +213,34 @@ void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *o Iterator in(input, win); Iterator out(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t vscale = + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], - scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7], - scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11], - scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15] - } - }; - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, vscale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[x])); - } - }, - in, out); + const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4], + scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9], + scale[x + 10], 
scale[x + 11], scale[x + 12], scale[x + 13], + scale[x + 14], scale[x + 15]}}; + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, vscale); + + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale[x])); + } + }, + in, out); } template @@ -257,28 +261,30 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale)); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale)); + } + }, + in, out); } template @@ -299,34 +305,36 @@ void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Win Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize_int16(vin, scale); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - store_result(reinterpret_cast(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize_int16(vin, scale); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int16_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize_qsymm16(val, scale)); - } - }, - in, out); + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int16_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize_qsymm16(val, scale)); + } + }, + in, out); } template void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) { - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: run_dequantization_qasymm8(input, output, window); @@ -335,7 +343,9 @@ void run_dequantization_core(const ITensor *input, ITensor *output, const Window 
run_dequantization_qasymm8(input, output, window); break; case DataType::QSYMM8_PER_CHANNEL: - input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc(input, output, window) : run_dequantization_qsymm8_per_channel_nchw(input, output, window); + input->info()->data_layout() == DataLayout::NHWC + ? run_dequantization_qsymm8_per_channel_nhwc(input, output, window) + : run_dequantization_qsymm8_per_channel_nchw(input, output, window); break; case DataType::QSYMM8: run_dequantization_qsymm8(input, output, window); @@ -377,7 +387,7 @@ void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, con const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(dst->info()->data_type()) + switch (dst->info()->data_type()) { case DataType::F32: run_dequantization_core(src, dst, window); diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h index cfa991dc74..6ed58587c9 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.h +++ b/src/cpu/kernels/CpuDequantizeKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp index a4cdddee5e..4cb0fb1c40 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp @@ -22,13 +22,14 @@ * SOFTWARE. */ #include "src/cpu/kernels/CpuDirectConv2dKernel.h" -#include "src/cpu/kernels/directconv2d/list.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/directconv2d/list.h" using namespace arm_compute::detail; @@ -38,26 +39,25 @@ namespace cpu { namespace kernels { -static const std::vector available_kernels = -{ - { - "neon_fp32_nhwc_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, - REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d) - }, - { - "neon_fp32_nchw_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, - REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d) - }, - { - "neon_fp16_nchw_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d) - }, +static const std::vector available_kernels = { + {"neon_fp32_nhwc_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)}, + {"neon_fp32_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, + 
REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)}, + {"neon_fp16_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); @@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); ARM_COMPUTE_UNUSED(width_idx); // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); @@ -100,11 +100,15 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso // Configure window without any padding win = calculate_max_window(*dst, Steps()); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) +void CpuDirectConv2dKernel::configure(ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -129,12 +133,13 @@ void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, IT ICpuKernel::configure(win_config.second); } -Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), - dst->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); return Status{}; } @@ -149,7 +154,8 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() }); + const auto *uk = CpuDirectConv2dKernel::get_implementation( + DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); uk->ukernel(window, src, weights, dst, _conv_info); diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h index b9265dc630..ad4caea193 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.h +++ 
b/src/cpu/kernels/CpuDirectConv2dKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuDirectConv2dKernel : public ICpuKernel { private: - using DirectConv2dKernel_Ptr = std::add_pointer::type; + using DirectConv2dKernel_Ptr = std::add_pointer::type; public: CpuDirectConv2dKernel() = default; @@ -64,10 +65,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DirectConv2dKernel @@ -81,8 +85,8 @@ public: private: PadStrideInfo _conv_info{}; - unsigned int _kernel_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _kernel_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp index 93ad5e5eba..d4af8bedaf 100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp @@ -27,15 +27,16 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include #include @@ -49,7 +50,9 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -57,22 +60,23 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index( + src->data_layout(), DataLayoutDimension::CHANNEL))); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); } - if(src->data_type() == DataType::S32) + if (src->data_type() == DataType::S32) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); } // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && 
(dst->total_size() != 0)) { - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -82,10 +86,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } - else if(src->data_type() == DataType::S32) + else if (src->data_type() == DataType::S32) { // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo - ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && + (info.output_data_type != DataType::QASYMM8_SIGNED)); } return Status{}; @@ -93,8 +98,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const template typename std::enable_if::value, void>::type -output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +output_stage_nchw(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; /** SIMD vector tag type. */ @@ -113,50 +123,57 @@ output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITens Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if(has_bias) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } + // Accumulate bias + if (has_bias) + { + const auto vb = wrapper::vdup_n( + *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } - // Accumulate bias - if(has_bias) + // Left-overs loop + for (; x < window_end_x; ++x) { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); - *(reinterpret_cast(out.ptr()) + x) = s_in; - } + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } - }, - in, out); + *(reinterpret_cast(out.ptr()) + x) = s_in; + } + }, + in, out); } template typename std::enable_if::value, void>::type -output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, 
ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); @@ -179,50 +196,59 @@ output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITens Iterator bi(bias, window_bias); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if(has_bias) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); + const auto out_ptr = reinterpret_cast(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } - // Accumulate bias - if(has_bias) + // Left-overs loop + for (; x < window_end_x; ++x) { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); - const auto out_ptr = reinterpret_cast(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); } // Quantized case -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +template < + typename TOut, + typename std::enable_if::value || std::is_same::value, int>::type = 0> +void output_stage_nchw(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; using VectorType = typename wrapper::traits::neon_bitvector_t; @@ -242,67 +268,63 @@ void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = + int x = window_start_x; + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12)}}; + + // Accumulate bias + if (has_bias) { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) + const auto vb = wrapper::vdup_n( + *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}}; } - }; - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = - { - { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) - } - }; + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); } - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, - min, max, false)); - } + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast(in.ptr()) + x); - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast(in.ptr()) + x); + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, out); + }, + in, out); } -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +template < + typename TOut, + typename std::enable_if::value || std::is_same::value, int>::type = 0> +void output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; using VectorType = typename wrapper::traits::neon_bitvector_t; @@ -329,62 +351,65 @@ void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, Iterator bi(bias, window_bias); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - // 
Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = {{ + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + }}; + + // Accumulate bias + if (has_bias) { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), - } - }; + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); } - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); - } + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32_t s_in = *in_ptr; - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32_t s_in = *in_ptr; + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, bi, out); + }, + in, bi, out); } } // namespace -void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_UNUSED(bias); @@ -398,7 +423,7 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor _result_offset_after_shift = info.result_offset_after_shift; // Auto-initialize output output if required - if(dst != nullptr) + if (dst != nullptr) { // Work out expected output data type const DataType output_dt = (src->data_type() == DataType::S32) ? 
info.output_data_type : DataType::S32; @@ -410,16 +435,17 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor ICpuKernel::configure(win); - const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; + const bool is_qasymm8_signed = + (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; // Set appropriate function - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::S32: { - if(is_qasymm8_signed) + if (is_qasymm8_signed) { _func = &output_stage_nchw; } @@ -449,11 +475,11 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor } else { - switch(src->data_type()) + switch (src->data_type()) { case DataType::S32: { - if(is_qasymm8_signed) + if (is_qasymm8_signed) { _func = &output_stage_nhwc; } @@ -483,7 +509,9 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor } } -Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h index d3ef17b7c9..ce84f49cf6 100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -55,29 +56,40 @@ public: * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata */ - void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + void + configure(ITensorInfo *src, + const ITensorInfo *bias = nullptr, + ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv2dOutputStageKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + static Status + validate(const ITensorInfo *src, + const ITensorInfo *bias = nullptr, + const ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int 
result_shift, int result_offset_after_shift); + using OutputStageKernel = void(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift); - OutputStageKernel *_func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; + OutputStageKernel *_func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp index 22c60cd994..b5b2aed1ba 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/conv3d/neon/list.h" #include @@ -49,43 +50,37 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc) - }, + {"neon_fp16_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc)}, #endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc) - }, - { - "neon_qasymm8_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc) - }, - { - "neon_qasymm8_signed_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc) - } -}; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info) + {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc)}, + {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc)}, + {"neon_qasymm8_signed_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc)}}; + +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo 
&conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U)); - const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -96,9 +91,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx)); - if(src2 != nullptr) + if (src2 != nullptr) { - if(is_data_type_quantized(src0->data_type())) + if (is_data_type_quantized(src0->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); } @@ -106,14 +101,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); DataType data_type = src0->data_type(); @@ -125,12 +122,17 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } // namespace -void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info) +void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_UNUSED(src2); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -139,7 +141,8 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo _name = std::string("CpuDirectConv3dKernel").append("/").append(uk->name); // Get convolved dimensions - TensorShape output_shape = 
misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); DataType data_type = src0->data_type(); @@ -154,7 +157,11 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo ICpuKernel::configure(win); } -Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info) +Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info)); @@ -188,4 +195,4 @@ const std::vector &CpuDirectConv3dKer } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h index 688f368b9f..8e6f564679 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.h +++ b/src/cpu/kernels/CpuDirectConv3dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,7 +40,8 @@ class CpuDirectConv3dKernel : public ICpuKernel { private: /* Template function for convolution 3d NDHWC */ - using DirectConv3dKernelPtr = std::add_pointer::type; + using DirectConv3dKernelPtr = std::add_pointer::type; public: CpuDirectConv3dKernel() = default; @@ -63,17 +65,25 @@ public: * @param[in] conv_info Contains padding, stride, acitvation information. 
* */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv3dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DirectConv3dKernel @@ -87,7 +97,7 @@ public: private: Conv3dInfo _conv_info{}; - DirectConv3dKernelPtr _run_method{ nullptr }; + DirectConv3dKernelPtr _run_method{nullptr}; std::string _name{}; }; diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp index a045855b1a..57a3f39822 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -24,8 +24,9 @@ #include "src/cpu/kernels/CpuElementwiseKernel.h" #include "arm_compute/core/Helpers.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/elementwise_binary/list.h" @@ -35,11 +36,11 @@ #if defined(ENABLE_FP32_KERNELS) namespace { - static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; - static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; - static constexpr size_t default_div_mws_N1_fp32_neon = 19043; - static constexpr size_t default_div_mws_V1_fp32_neon = 25511; -} +static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; +static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; +static constexpr size_t default_div_mws_N1_fp32_neon = 19043; +static constexpr size_t default_div_mws_V1_fp32_neon = 25511; +} // namespace #endif /* ENABLE_FP32_KERNELS */ namespace arm_compute @@ -50,255 +51,178 @@ namespace kernels { namespace { -template -const std::vector::ElementwiseKernel> available_kernels_arithmetic = -{ - { - "sve2_qu8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary) - }, - { - "sve2_qs8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary) - }, - { - "sve_fp32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_FP32_SVE(sve_fp32_elementwise_binary) - }, - { - "sve_s32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; - }, - 
REGISTER_INTEGER_SVE(sve_s32_elementwise_binary) - }, - { - "sve_s16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s16_elementwise_binary) - }, - { - "sve_fp16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_SVE(sve_fp16_elementwise_binary) - }, - { - "neon_fp32_arithmetic", - - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && static_cast(data.op) == op; - }, - REGISTER_FP32_NEON(neon_fp32_elementwise_binary) - }, - { - "neon_s32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s32_elementwise_binary) - }, - { - "neon_fp16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_NEON(neon_fp16_elementwise_binary) - }, - { - "neon_s16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s16_elementwise_binary) - }, - { - "neon_qu8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary) - }, - { - "neon_qs8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary) - }, +template +const std::vector::ElementwiseKernel> available_kernels_arithmetic = { + {"sve2_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary)}, + {"sve2_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary)}, + {"sve_fp32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_FP32_SVE(sve_fp32_elementwise_binary)}, + {"sve_s32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_elementwise_binary)}, + {"sve_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_elementwise_binary)}, + {"sve_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_elementwise_binary)}, + {"neon_fp32_arithmetic", + + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_elementwise_binary)}, + {"neon_s32_arithmetic", + 
[](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_elementwise_binary)}, + {"neon_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_elementwise_binary)}, + {"neon_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_elementwise_binary)}, + {"neon_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary)}, + {"neon_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary)}, }; -template -const std::vector::ElementwiseKernel> available_kernels_comperison = -{ - { - "sve2_qu8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary) - }, - { - "sve2_qs8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary) - }, - { - "sve_u8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary) - }, - { - "sve_fp32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary) - }, - { - "sve_s16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary) - }, - { - "sve_s32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary) - }, - { - "sve_fp16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary) - }, - { - "neon_u8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary) - }, - { - "neon_fp32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && static_cast(data.op) == op; - }, - REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary) - }, - { - "neon_s16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary) - }, - { - 
"neon_s32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && static_cast(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary) - }, - { - "neon_qu8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary) - }, - { - "neon_qs8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary) - }, - { - "neon_fp16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; - }, - REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary) - }, +template +const std::vector::ElementwiseKernel> available_kernels_comperison = { + {"sve2_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary)}, + {"sve2_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary)}, + {"sve_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary)}, + {"sve_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary)}, + {"sve_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary)}, + {"sve_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary)}, + {"sve_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary)}, + {"neon_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary)}, + {"neon_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary)}, + {"neon_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary)}, + {"neon_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary)}, + 
{"neon_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && static_cast(data.op) == op; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary)}, + {"neon_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary)}, + {"neon_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary)}, }; } // namespace -const std::vector::ElementwiseKernel> &CpuArithmeticKernel::get_available_kernels() +const std::vector::ElementwiseKernel> & +CpuArithmeticKernel::get_available_kernels() { static std::vector::ElementwiseKernel> available_kernels; - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic.begin(), available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic.begin(), + available_kernels_arithmetic.end(), std::back_inserter(available_kernels)); return available_kernels; } -const std::vector::ElementwiseKernel> &CpuComparisonKernel::get_available_kernels() +const std::vector::ElementwiseKernel> & +CpuComparisonKernel::get_available_kernels() { static std::vector::ElementwiseKernel> available_kernels; - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), 
available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison.begin(), available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison.begin(), + available_kernels_comperison.end(), + std::back_inserter(available_kernels)); return available_kernels; } template -Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, + const ITensorInfo &src1, + const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); @@ -308,7 +232,7 @@ Status CpuElementwiseKernel::validate_arguments_common(const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for output"); @@ -321,7 +245,8 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuArithmeticKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op) }); + const auto *uk = CpuArithmeticKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op)}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -329,7 +254,7 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso _name = std::string("CpuArithmeticKernel").append("/").append(uk->name); // If any of shapes is dynamic, expect a configured window and dst at run-time. 
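[Editor's note] For context on the dispatch tables rewrapped above: each available_kernels entry pairs a human-readable name, a selector lambda over the data type and CPU ISA flags, and a pointer to the micro-kernel; get_implementation() picks a matching entry, effectively the first match in table order, which is why the more specialised SVE2/SVE/FP16 entries are listed before the generic Neon fallbacks. Below is a minimal self-contained sketch of that table-driven selection under hypothetical names (SelectorData, MicroKernel, run_fp32, run_fp16); it is not the ComputeLibrary API, only an illustration of the pattern.

    // Sketch of a name / predicate / function-pointer kernel registry.
    // All names here are illustrative assumptions, not arm_compute types.
    #include <cstdio>
    #include <functional>
    #include <vector>

    enum class DataType { F32, F16 };

    struct SelectorData
    {
        DataType dt;
        bool     fp16; // stands in for the CPUInfo/ISA flags used by the real selectors
    };

    struct MicroKernel
    {
        const char                               *name;
        std::function<bool(const SelectorData &)> is_selected;
        void (*ukernel)(float *, size_t);
    };

    void run_fp32(float *data, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
        {
            data[i] *= 2.f; // placeholder for a real vectorised kernel body
        }
    }

    void run_fp16(float *, size_t)
    {
        // a real FP16 kernel would operate on half-precision storage
    }

    // Return the first entry whose predicate accepts the runtime selector data.
    const MicroKernel *get_implementation(const std::vector<MicroKernel> &table, const SelectorData &data)
    {
        for (const auto &uk : table)
        {
            if (uk.is_selected(data))
            {
                return &uk;
            }
        }
        return nullptr;
    }

    int main()
    {
        // Most specialised entries first, generic fallback last.
        const std::vector<MicroKernel> table = {
            {"fp16_kernel", [](const SelectorData &d) { return d.dt == DataType::F16 && d.fp16; }, run_fp16},
            {"fp32_kernel", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_fp32},
        };

        float buf[4] = {1.f, 2.f, 3.f, 4.f};
        if (const MicroKernel *uk = get_implementation(table, {DataType::F32, false}))
        {
            std::printf("selected %s\n", uk->name);
            uk->ukernel(buf, 4);
        }
        return 0;
    }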
- if(src0->is_dynamic() || src1->is_dynamic()) + if (src0->is_dynamic() || src1->is_dynamic()) { return; } @@ -343,7 +268,8 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuComparisonKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op) }); + const auto *uk = CpuComparisonKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast(_op)}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -351,7 +277,7 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso _name = std::string("CpuComparisonKernel").append("/").append(uk->name); // If any of shapes is dynamic, expect a configured window and dst at run-time. - if(src0->is_dynamic() || src1->is_dynamic()) + if (src0->is_dynamic() || src1->is_dynamic()) { return; } @@ -373,8 +299,10 @@ void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &w _run_method(src0, src1, dst, window); } -template void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); -template void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); +template void +CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); +template void +CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); template const char *CpuElementwiseKernel::name() const @@ -385,7 +313,10 @@ template const char *CpuElementwiseKernel::name() const; template const char *CpuElementwiseKernel::name() const; /** Arithmetic operators (min, max, squared_diff) */ -void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +void CpuArithmeticKernel::configure(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); _op = op; @@ -394,16 +325,20 @@ void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *s Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::F16, DataType::S32, DataType::F32); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); } return validate_arguments_common(src0, src1, dst); } -Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status CpuArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); @@ -416,15 +351,15 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &neon_fp32_elementwise_binary - || 
this->_run_method == &neon_fp32_elementwise_binary) + if (this->_run_method == &neon_fp32_elementwise_binary || + this->_run_method == &neon_fp32_elementwise_binary) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_min_max_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_min_max_mws_V1_fp32_neon; } @@ -434,7 +369,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -447,7 +382,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count return std::max(static_cast(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; @@ -467,14 +402,14 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &neon_fp32_elementwise_binary) + if (this->_run_method == &neon_fp32_elementwise_binary) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_div_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_div_mws_V1_fp32_neon; } @@ -484,7 +419,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -497,7 +432,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) return std::max(static_cast(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; @@ -538,7 +473,10 @@ Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1 } /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */ -void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +void CpuComparisonKernel::configure(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); _op = op; @@ -547,16 +485,21 @@ void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *s Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, + DataType::S32, DataType::F32); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 
1, DataType::U8); } return validate_arguments_common(src0, src1, dst); } -Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status CpuComparisonKernel::validate(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h index 634e38bf9f..1f3e613b80 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.h +++ b/src/cpu/kernels/CpuElementwiseKernel.h @@ -43,7 +43,8 @@ template class CpuElementwiseKernel : public ICpuKernel { private: - using ElementwiseKernelPtr = std::add_pointer::type; + using ElementwiseKernelPtr = + std::add_pointer::type; public: CpuElementwiseKernel() = default; @@ -72,7 +73,7 @@ protected: static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); protected: - ElementwiseKernelPtr _run_method{ nullptr }; + ElementwiseKernelPtr _run_method{nullptr}; std::string _name{}; }; @@ -96,7 +97,8 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + static Status + validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); static const std::vector::ElementwiseKernel> &get_available_kernels(); @@ -200,7 +202,8 @@ public: * * @return a status */ - static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + static Status + validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); static const std::vector::ElementwiseKernel> &get_available_kernels(); @@ -226,4 +229,4 @@ private: } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 04a7f15715..88545ee756 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/elementwise_unary/list.h" @@ -59,12 +60,13 @@ std::unique_ptr q8_prepare_lut(ElementWiseUnary op, const ITensorInfo const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale; const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale; - for(int i = 0; i < 256; ++i) + for (int i = 0; i < 256; ++i) { - const auto in = (is_signed) ? dequantize_qasymm8_signed(static_cast(i), src_qi) : dequantize_qasymm8(i, src_qi); - float result = 0; + const auto in = + (is_signed) ? 
dequantize_qasymm8_signed(static_cast(i), src_qi) : dequantize_qasymm8(i, src_qi); + float result = 0; - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: result = 1 / sqrt(in); @@ -100,7 +102,8 @@ std::unique_ptr q8_prepare_lut(ElementWiseUnary op, const ITensorInfo result = utility::clamp(result, dst_min_fp, dst_max_fp); - const auto out = (is_signed) ? static_cast(quantize_qasymm8_signed(result, dst_qi)) : quantize_qasymm8(result, dst_qi); + const auto out = (is_signed) ? static_cast(quantize_qasymm8_signed(result, dst_qi)) + : quantize_qasymm8(result, dst_qi); lut[i] = out; } @@ -109,97 +112,68 @@ std::unique_ptr q8_prepare_lut(ElementWiseUnary op, const ITensorInfo #endif // __aarch64__ -static const std::vector available_kernels = -{ +static const std::vector available_kernels = { { "sve_fp32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32 && data.isa.sve); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); }, REGISTER_FP32_SVE(sve_fp32_elementwise_unary), nullptr, }, { "sve_fp16_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); }, REGISTER_FP16_SVE(sve_fp16_elementwise_unary), nullptr, }, { "sve_s32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::S32 && data.isa.sve); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); }, REGISTER_INTEGER_SVE(sve_s32_elementwise_unary), nullptr, }, { "neon_fp32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::F32; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(neon_fp32_elementwise_unary), nullptr, }, { "neon_fp16_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, REGISTER_FP16_NEON(neon_fp16_elementwise_unary), nullptr, }, { "neon_s32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::S32; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; }, REGISTER_INTEGER_NEON(neon_s32_elementwise_unary), nullptr, }, #ifdef __aarch64__ { "sve2_q8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; - }, + [](const DataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary), &q8_prepare_lut, }, { "neon_q8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; - }, + [](const DataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary), &q8_prepare_lut, }, #else // __aarch64__ { "neon_qasymm8_signed_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED; - }, + [](const DataTypeISASelectorData &data) { return data.dt == 
DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary), nullptr, }, { "neon_qasymm8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary), nullptr, }, @@ -211,7 +185,8 @@ static const std::vector avai void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) { ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst)); - const auto uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _op = op; @@ -219,12 +194,12 @@ void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name); // If input shape is dynamic, expect a configured window and dst at run-time. - if(src.is_dynamic()) + if (src.is_dynamic()) { return; } - if(uk->prepare_func != nullptr) + if (uk->prepare_func != nullptr) { _lut = uk->prepare_func(op, &src, &dst); } @@ -238,28 +213,31 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - const auto *uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - switch(op) + switch (op) { case ElementWiseUnary::EXP: case ElementWiseUnary::RSQRT: case ElementWiseUnary::LOG: case ElementWiseUnary::ROUND: case ElementWiseUnary::SIN: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); break; case ElementWiseUnary::NEG: case ElementWiseUnary::ABS: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); break; default: ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported"); } // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); } diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h index 00188f0d49..249909854e 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -42,8 +43,10 @@ namespace kernels class CpuElementwiseUnaryKernel : public ICpuKernel { private: - using ElementwiseUnaryUkernelPtr = std::add_pointer::type; - using 
ElementwiseUnaryPreparePtr = std::add_pointer(ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type; + using ElementwiseUnaryUkernelPtr = + std::add_pointer::type; + using ElementwiseUnaryPreparePtr = std::add_pointer( + ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type; public: CpuElementwiseUnaryKernel() = default; @@ -65,7 +68,7 @@ public: static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct ElementwiseUnaryKernel @@ -80,7 +83,7 @@ public: private: ElementWiseUnary _op{}; - ElementwiseUnaryUkernelPtr _run_method{ nullptr }; + ElementwiseUnaryUkernelPtr _run_method{nullptr}; std::string _name{}; std::unique_ptr _lut{}; }; diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp index f69de0082d..754da97ae1 100644 --- a/src/cpu/kernels/CpuFillKernel.cpp +++ b/src/cpu/kernels/CpuFillKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -68,17 +69,18 @@ void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const Thr collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator tensor_it(inout, collapsed); - execute_window_loop(collapsed, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + tensor_it.offset(); - // Set memory - for(int i = 0; i < window_width; ++i) + execute_window_loop( + collapsed, + [&](const Coordinates &) { - std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); - } - - }, - tensor_it); + uint8_t *base_addr = start_valid_region + tensor_it.offset(); + // Set memory + for (int i = 0; i < window_width; ++i) + { + std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); + } + }, + tensor_it); } const char *CpuFillKernel::name() const diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h index ce41afc462..7c200c9b59 100644 --- a/src/cpu/kernels/CpuFillKernel.h +++ b/src/cpu/kernels/CpuFillKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_FILL_KERNEL_H #include "arm_compute/core/PixelValue.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -48,7 +49,7 @@ public: void configure(const ITensorInfo *tensor, const PixelValue &constant_value); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp index 65e390a81a..df7e6aad46 100644 --- a/src/cpu/kernels/CpuFloorKernel.cpp +++ b/src/cpu/kernels/CpuFloorKernel.cpp @@ -27,11 +27,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/floor/list.h" namespace arm_compute @@ 
-42,29 +42,22 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_fp16_floor", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) - }, - { - "neon_fp32_floor", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) - } -}; +static const std::vector available_kernels = { + {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)}, + {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); // Validate in case of configured output - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -81,7 +74,8 @@ void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; @@ -122,17 +116,14 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); const auto len = static_cast(window.x().end()) - static_cast(window.x().start()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src_it(src, win); Iterator dst_it(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - _run_method(src_it.ptr(), dst_it.ptr(), len); - }, - src_it, dst_it); + execute_window_loop( + win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it); } const char *CpuFloorKernel::name() const diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h index 35ab534ca8..57107d0532 100644 --- a/src/cpu/kernels/CpuFloorKernel.h +++ b/src/cpu/kernels/CpuFloorKernel.h @@ -65,7 +65,7 @@ public: Window infer_window(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct FloorKernel @@ -78,7 +78,7 @@ public: static const std::vector &get_available_kernels(); private: - FloorKernelPtr _run_method{ nullptr }; + FloorKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp index 
9fbf2d54c6..db433c99a8 100644 --- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp @@ -24,9 +24,10 @@ #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,7 +61,7 @@ Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITenso //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = compute_interleaved_shape(*src); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -111,35 +112,42 @@ void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &wind Iterator in(src, win); Iterator out(dst, win_out); - execute_window_loop(win, [&](const Coordinates & id) - { - if(id.y() + 4 <= static_cast(in_height)) + execute_window_loop( + win, + [&](const Coordinates &id) { - for(size_t x = window_start_x; x < window_end_x; ++x) + if (id.y() + 4 <= static_cast(in_height)) { - std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size); - } - } - else - { - for(size_t x = window_start_x; x < window_end_x; ++x) - { - size_t y = 0; - for(; y < partial_y; ++y) + for (size_t x = window_start_x; x < window_end_x; ++x) { - std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size); + std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, + element_size); } - for(; y < 4; ++y) + } + else + { + for (size_t x = window_start_x; x < window_end_x; ++x) { - std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + size_t y = 0; + for (; y < partial_y; ++y) + { + std::memcpy(out.ptr() + (x * 4 + y) * element_size, + (in.ptr() + y * in_stride) + x * element_size, element_size); + } + for (; y < 4; ++y) + { + std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + } } } - } - }, - in, out); + }, + in, out); } const char *CpuGemmInterleave4x4Kernel::name() const diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h index 4fb6a52a8b..2ce34bc4bc 100644 --- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h @@ -71,7 +71,7 @@ public: static Status validate(const ITensorInfo *src, const 
ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp index f8bef64066..a3ed2cd171 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,646 +45,494 @@ namespace kernels { namespace { -void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_u8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) { - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { + if (id.x() > width_b) { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) + return; } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const uint8x8_t a00_u8 = vld1_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); - const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); - const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); - const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); - const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); - const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); - const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); - const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4x2_t a00_u16 = - { - { - vget_low_u16(vmovl_u8(a00_u8)), - vget_high_u16(vmovl_u8(a00_u8)) - } - }; - - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - const uint16x4x4_t b10_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) - } - }; - - const uint16x4x4_t b20_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) - } - }; - const uint16x4x4_t b30_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) - } - }; + // Note: Since the input are all positives, we can 
use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const uint8x8_t a00_u8 = vld1_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); + const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); + const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); + const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); + const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); + const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); + const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); + const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}}; + + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + const uint16x4x4_t b10_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}}; + + const uint16x4x4_t b20_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}}; + + const uint16x4x4_t b30_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}}; + + const uint16x4x4_t b40_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}}; + + const uint16x4x4_t b50_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}}; + + const uint16x4x4_t b60_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}}; + + const uint16x4x4_t b70_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); + + // Accumulate 2: + c0.val[0] 
= vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - const uint16x4x4_t b40_u16 = + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) { - { - vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) - } - }; + const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b); - const uint16x4x4_t b50_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) - } - }; + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; - const uint16x4x4_t b60_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) - } - }; + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - const uint16x4x4_t b70_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); - c0.val[1] = 
vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b); + vec_a += 1; + matrix_b += stride_b; + } - const uint16x4x4_t b00_u16 = + auto vec_out = reinterpret_cast(out.ptr()); + if (id.x() < (width_out - 16)) { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Accumulate 0: - c0.val[0] = 
vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vec_out + k * 4 + j) = c0.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_s8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) { - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Accumulators for the block 0 - int32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { + if (id.x() > width_b) { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + return; } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const int8x8_t a00_s8 = vld1_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); - const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); - const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); - const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); - const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); - const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); - const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); - const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); - - // Convert a00_s8 to int16_t and get the lower part - const int16x4x2_t a00_s16 = - { - { - vget_low_s16(vmovl_s8(a00_s8)), - vget_high_s16(vmovl_s8(a00_s8)) - } - }; - - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - const int16x4x4_t b10_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) - } - }; - const int16x4x4_t b20_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), - 
vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) - } - }; + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const int8x8_t a00_s8 = vld1_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); + const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); + const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); + const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); + const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); + const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); + const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); + const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); + + // Convert a00_s8 to int16_t and get the lower part + const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}}; + + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + const int16x4x4_t b10_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}}; + + const int16x4x4_t b20_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}}; + + const int16x4x4_t b30_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}}; + + const int16x4x4_t b40_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}}; + + const int16x4x4_t b50_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}}; + + const int16x4x4_t b60_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}}; + + const int16x4x4_t b70_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); + c0.val[3] = 
vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - const int16x4x4_t b30_s16 = + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) { - { - vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) - } - }; + const int8x8_t a00_s8 = vld1_dup_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b); - const int16x4x4_t b40_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) - } - }; + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; - const int16x4x4_t b50_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) - } - }; + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - const int16x4x4_t b60_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) - } - }; - - const int16x4x4_t 
b70_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const int8x8_t a00_s8 = vld1_dup_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b); + vec_a += 1; + matrix_b += stride_b; + } - const int16x4x4_t b00_s16 = + auto vec_out = reinterpret_cast(out.ptr()); + if (id.x() < (width_out - 16)) { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - 
vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, c0.val[0]); - vst1q_s32(vec_out + 4, c0.val[1]); - vst1q_s32(vec_out + 8, c0.val[2]); - vst1q_s32(vec_out + 12, c0.val[3]); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vec_out + k * 4 + j) = c0.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +void inline matrix_multiply_u8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { const auto width_out = static_cast(out_info.dimension(0)); const auto height_out = static_cast(out_info.dimension(1)); const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *mtx_a0 = ina.ptr(); - const uint8_t *mtx_b0 = inb.ptr(); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 1 + uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 2 + uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 3 + uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const uint8x8_t a00_u8 = vld1_u8(mtx_a0); + const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Convert b00_s8 to uint16_t + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + // 4x4 block 0 + c0.val[0] = 
vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); + c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); + c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); + c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); + c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); + c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); + c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); + c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); + c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); + c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); } - }; - // Accumulators for the block 1 - uint32x4x4_t c1 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; + auto mtx_out = reinterpret_cast(out.ptr()); - // Accumulators for the block 2 - uint32x4x4_t c2 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 3 - uint32x4x4_t c3 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const uint8x8_t a00_u8 = vld1_u8(mtx_a0); - const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Convert b00_s8 to uint16_t - const uint16x4x4_t b00_u16 = + if (id.y() < height_out && id.x() < (width_out - 16)) { + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + if (id.y() + 1 < height_out) { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); - c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); - c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); - c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); - c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); - c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); - c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); - 
c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); - c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); - c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); - } - - auto mtx_out = reinterpret_cast(out.ptr()); - - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); - vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); - vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); - vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); - vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); - vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); - vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); - if(id.y() + 3 < height_out) + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + if (id.y() + 2 < height_out) { - vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); - vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); - vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); - vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + } } } } - } - else - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + *(mtx_out + k * 4 + j) = c0.val[k][j]; } } - if(id.y() + 2 < height_out) + if (id.y() + 1 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) 
{ - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; } } - if(id.y() + 3 < height_out) + if (id.y() + 2 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 3 < height_out) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +void inline matrix_multiply_s8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { const auto width_out = static_cast(out_info.dimension(0)); const auto height_out = static_cast(out_info.dimension(1)); @@ -691,182 +540,148 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto *mtx_a0 = reinterpret_cast(ina.ptr()); - auto *mtx_b0 = reinterpret_cast(inb.ptr()); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - int32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + auto *mtx_a0 = reinterpret_cast(ina.ptr()); + auto *mtx_b0 = reinterpret_cast(inb.ptr()); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 1 + int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 2 + int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 3 + int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const int8x8_t a00_s8 = vld1_s8(mtx_a0); + const int8x16_t b00_s8 = vld1q_s8(mtx_b0); + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Convert b00_s8 to int16_t + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], 
a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); + c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); + c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); + c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); + c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); + c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); + c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); + c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); + c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); + c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); } - }; - - // Accumulators for the block 1 - int32x4x4_t c1 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 2 - int32x4x4_t c2 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 3 - int32x4x4_t c3 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const int8x8_t a00_s8 = vld1_s8(mtx_a0); - const int8x16_t b00_s8 = vld1q_s8(mtx_b0); - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Convert b00_s8 to int16_t - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); - c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); - c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); - c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); - c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); - c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); - c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); - c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); - c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); - c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); - } - auto mtx_out = reinterpret_cast(out.ptr()); - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - 
vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - if(id.y() + 3 < height_out) + auto mtx_out = reinterpret_cast(out.ptr()); + if (id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); + vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); + vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); + vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); + if (id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); + vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); + vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); + vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); + if (id.y() + 2 < height_out) { - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); + vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); + vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); + vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); + vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); + vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); + vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + } } } } - } - else if(id.y() < height_out) - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) + else if (id.y() < height_out) { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + *(mtx_out + k * 4 + j) = c0.val[k][j]; } } - if(id.y() + 2 < height_out) + if (id.y() + 1 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; } } - if(id.y() + 3 < height_out) + if (id.y() + 2 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 3 < height_out) + { + left_over = left_over_value; 
+ for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } } } } } } - } - - }, - ina, inb, out); + }, + ina, inb, out); } Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); TensorShape in0_shape = src0->tensor_shape(); @@ -874,9 +689,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons TensorShape out_shape = dst->tensor_shape(); // Check vector-by-matrix case - if(out_shape[1] == 1) + if (out_shape[1] == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], + "The number of input0's columns must be equal to input1's rows"); } else { @@ -884,8 +700,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons in1_shape.collapse(2); out_shape.collapse(2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], + "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], + "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16"); } @@ -909,20 +728,22 @@ void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const I Window win; // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication - if((dst->dimension(1) == 1)) + if ((dst->dimension(1) == 1)) { // Configure kernel window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); } else { - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); } ICpuKernel::configure(win); } -Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status +CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); return Status{}; @@ -939,12 +760,13 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window auto dst = tensors.get_tensor(TensorType::ACL_DST); // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication path - if((dst->info()->dimension(1) == 1)) + if ((dst->info()->dimension(1) == 1)) { const auto width_matrix_a = static_cast(src0->info()->dimension(0)); const auto width_matrix_b = static_cast(src1->info()->dimension(0)); const auto width_out = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); + const auto in_b_stride = + static_cast(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); // The implementation computes 16 elements per iteration const int window_start_x = 16 * info.thread_id; @@ -963,7 +785,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(src1->info()->num_dimensions() >= 3) + if (src1->info()->num_dimensions() >= 3) { win_b = window; } @@ -974,18 +796,20 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Iterator inb(src1, win_b); Iterator out(dst, win_out); - switch(src0->info()->data_type()) + switch (src0->info()->data_type()) { case DataType::S8: case DataType::QASYMM8_SIGNED: { - vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); break; } case DataType::U8: case DataType::QASYMM8: { - vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); break; } default: @@ -1009,7 +833,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(_slide_matrix_b) + if (_slide_matrix_b) { win_b = window; } @@ -1021,7 +845,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Iterator inb(src1, win_b); Iterator out(dst, window); - switch(src0->info()->data_type()) + switch 
(src0->info()->data_type()) { case DataType::S8: case DataType::QASYMM8_SIGNED: @@ -1050,4 +874,4 @@ const char *CpuGemmLowpMatrixMultiplyKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h index 2cc789d6d9..439ada1b47 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h @@ -68,11 +68,11 @@ public: static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - bool _slide_matrix_b{ true }; + bool _slide_matrix_b{true}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp index 534076b97c..9bd1eae663 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -38,37 +39,49 @@ namespace kernels { namespace { -Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(1), + "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } -Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(0),
+            "Output vector must have length equal to the number of columns of the input matrix");
     }
 
     return Status{};
 }
 } // namespace
 
-void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src,
+                                                  ITensorInfo *dst,
+                                                  const GEMMLowpReductionKernelInfo &info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -77,7 +90,7 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso
     _scalar = info.scalar;
     _mul_by_scalar = info.mul_by_scalar;
 
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::QASYMM8:
             _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
@@ -98,14 +111,18 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso
     ICpuKernel::configure(win);
 }
 
-Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src,
+                                                   const ITensorInfo *dst,
+                                                   const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
     return Status{};
 }
 
 template <typename T>
-void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window)
+void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src,
+                                                     ITensor *dst,
+                                                     const arm_compute::Window &window)
 {
     // Intermediate and final accumulator types
     using TIAcc = wrapper::traits::promote_t<T>;
@@ -121,55 +138,58 @@ void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor
     Iterator in(src, win_input);
     Iterator out(dst, collapsed_window);
 
-    execute_window_loop(collapsed_window, [&](const Coordinates & id)
-    {
-        auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
-        TAcc sum_row = 0;
+    execute_window_loop(
+        collapsed_window,
+        [&](const Coordinates &id)
+        {
+            auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+            TAcc sum_row = 0;
 
-        const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
+            const T *matrix_a = reinterpret_cast<const T *>(
+                (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
 
 #if __arm__
-        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
 #endif /* __arm__ */
 
-        int i = 0;
-        // This for loop performs 16 accumulations
-        for(; i <= (_k - 16); i += 16)
-        {
-            const auto a0_d8 = wrapper::vloadq(matrix_a + i);
+            int i = 0;
+            // This for loop performs 16 accumulations
+            for (; i <= (_k - 16); i += 16)
+            {
+                const auto a0_d8 = wrapper::vloadq(matrix_a + i);
 
-            // Partial accumulations in U16
-            const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
+                // Partial accumulations in U16
+                const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
 
-            // Accumulate to U32
-            vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
-        }
+                // Accumulate to U32
+                vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
+            }
 
-        // This for loop performs the leftover accumulations
-        for(; i < _k; ++i)
-        {
-            sum_row += static_cast<TAcc>(matrix_a[i]);
-        }
+            // This for loop performs the leftover accumulations
+            for (; i < _k; ++i)
+            {
+                sum_row += static_cast<TAcc>(matrix_a[i]);
+            }
 
 #if defined(__aarch64__)
-        // Reduction operation available on 64 bit architectures only
-        sum_row += wrapper::vaddv(vsum_row);
+            // Reduction operation available on 64 bit architectures only
+            sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
-        auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
-        tmp = wrapper::vpadd(tmp, tmp);
+            auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+            tmp = wrapper::vpadd(tmp, tmp);
 
-        sum_row += wrapper::vgetlane(tmp, 0);
+            sum_row += wrapper::vgetlane(tmp, 0);
 #endif // __aarch64__
 
-        // Multiply by scalar if necessary
-        if(_mul_by_scalar)
-        {
-            sum_row *= _scalar;
-        }
+            // Multiply by scalar if necessary
+            if (_mul_by_scalar)
+            {
+                sum_row *= _scalar;
+            }
 
-        *(reinterpret_cast<int32_t *>(out.ptr())) = static_cast<int32_t>(sum_row);
-    },
-    in, out);
+            *(reinterpret_cast<int32_t *>(out.ptr())) = static_cast<int32_t>(sum_row);
+        },
+        in, out);
 }
 
 void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -189,7 +209,9 @@ const char *CpuGemmLowpMatrixAReductionKernel::name() const
     return "CpuGemmLowpMatrixAReductionKernel";
 }
 
-void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src,
+                                                  ITensorInfo *dst,
+                                                  const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));
@@ -201,7 +223,7 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso
     // Configure kernel window
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::QASYMM8:
             _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
@@ -223,14 +245,19 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso
     ICpuKernel::configure(win);
 }
 
-Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src,
+                                                   const ITensorInfo *dst,
+                                                   const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
     return Status{};
 }
 
 template <typename T>
-void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src,
+                                                     ITensor *dst,
+                                                     const Window &window,
+                                                     const ThreadInfo &info)
 {
     // Intermediate and final accumulator types
     using TIAcc = wrapper::traits::promote_t<T>;
@@ -258,121 +285,116 @@ void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor
     Iterator inb(src, win_in);
     Iterator out(dst, win_out);
 
-    execute_window_loop(win_out, [&](const Coordinates & id)
-    {
if(id.x() > width_matrix_b) + execute_window_loop( + win_out, + [&](const Coordinates &id) { - return; - } + if (id.x() > width_matrix_b) + { + return; + } - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - typename wrapper::traits::neon_bitvector::type sum_col[4] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; + // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation + typename wrapper::traits::neon_bitvector::type sum_col[4] = { + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{})}; - const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); + const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); #endif /* __arm__ */ - int i = 0; - // This for loop performs 4 accumulations - for(; i <= (_k - 4); i += 4) - { - const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); - const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); - const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); + int i = 0; + // This for loop performs 4 accumulations + for (; i <= (_k - 4); i += 4) + { + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); #endif /* __arm__ */ - // Partial accumulation in 16bit - typename wrapper::traits::neon_bitvector::type tmp_sum[2] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); - 
tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); - - matrix_b += 4 * in_b_stride; - } - - // This for loop perfoms the leftover accumulations - for(; i < _k; ++i) - { - const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector::type tmp_sum[2] = { + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{})}; + + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); + + matrix_b += 4 * in_b_stride; + } - // Convert S8 to S16 - const typename wrapper::traits::neon_bitvector::type b0_b16[2] + // This for loop perfoms the leftover accumulations + for (; i < _k; ++i) { - wrapper::vmovl(wrapper::vgetlow(b0_b8)), - wrapper::vmovl(wrapper::vgethigh(b0_b8)) - }; + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); + // Convert S8 to S16 + const typename wrapper::traits::neon_bitvector::type b0_b16[2]{ + wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))}; - matrix_b += in_b_stride; - } + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); - sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); - sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); - sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); - } - - auto vector_sum_col = reinterpret_cast(out.ptr()); - if(id.x() + 16 < width_matrix_b) - { - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); - wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); - 
} - else - { - auto left_over = width_matrix_b - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + matrix_b += in_b_stride; + } + + // Multiply by scalar if necessary + if (_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + + auto vector_sum_col = reinterpret_cast(out.ptr()); + if (id.x() + 16 < width_matrix_b) + { + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_matrix_b - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + } } } - } - }, - inb, out); + }, + inb, out); } void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) @@ -393,4 +415,4 @@ const char *CpuGemmLowpMatrixBReductionKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h index e469629cdb..20ef17e96d 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h @@ -66,7 +66,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -85,12 +85,14 @@ private: * @param[out] dst Output tensor * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). */ - using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window); - CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; + CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; }; /** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. @@ -124,7 +126,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -144,12 +146,15 @@ private: * @param[out] dst Output tensor * @param[in] window Region on which to execute the kernel. 
(Must be a valid region of the window returned by window()). */ - using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window, + const ThreadInfo &info); - CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; + CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp index a65f1a33de..e290783021 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,32 +45,37 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -80,13 +86,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } @@ -94,9 +102,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -void run_offset_contribution(const Window &window, - ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d) +void run_offset_contribution(const Window &window, + ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool slide_vector_sum_col, + bool is_gemm3d) { Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -112,7 +126,7 @@ void run_offset_contribution(const Window &window, const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0; Iterator mm_result_it(mm_result, collapsed_window); - if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true + if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); @@ -131,95 +145,85 @@ void run_offset_contribution(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const size_t batch_offset_col = batch_id * (sum_col_stride_y ); - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = batch_id * (sum_col_stride_y); + auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); - - int32x4x4_t in_s32 = + // Compute the leftover term due to a_offset. 
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = { + {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; + + offset_term_s32.val[0] = + vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); + offset_term_s32.val[1] = + vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); + offset_term_s32.val[2] = + vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); + offset_term_s32.val[3] = + vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } + // Compute the leftover term due to a_offset. + int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. 
- int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - a_offset_term_s32 *= a_offset; + a_offset_term_s32 *= a_offset; - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; - } - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it); + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; + } + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it); } - else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true + else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -233,54 +237,51 @@ void run_offset_contribution(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); - in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); - in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); - in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += b_offset_term_s32; - } - }, - vector_sum_row_it, mm_result_it); + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); + in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); + 
in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += b_offset_term_s32; + } + }, + vector_sum_row_it, mm_result_it); } - else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false + else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); @@ -290,69 +291,62 @@ void run_offset_contribution(const Window &window, Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const size_t batch_offset_col = batch_id * (sum_col_stride_y ); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = + batch_id * + (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor + auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - int32x4x4_t in_s32 = + // Compute the leftover term due to a_offset. 
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. - const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += a_offset_term_s32 * a_offset; - } - }, - vector_sum_col_it, mm_result_it); + // Compute the leftover term due to a_offset. 
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += a_offset_term_s32 * a_offset; + } + }, + vector_sum_col_it, mm_result_it); } else // false, false { @@ -362,7 +356,12 @@ void run_offset_contribution(const Window &window, } } // namespace -void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_UNUSED(vector_sum_row); @@ -374,7 +373,7 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen _k_offset = a_offset * b_offset * k; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { // Check if vector_sum_col_shape should be slidden or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 @@ -387,8 +386,11 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen ICpuKernel::configure(win); } -Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) +Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); return Status{}; @@ -405,11 +407,11 @@ void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Win auto mm_result = tensors.get_tensor(TensorType::ACL_DST); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); + run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, + _slide_vector_sum_col, reinterpret_as_3d); } const char *CpuGemmLowpOffsetContributionKernel::name() const @@ -418,4 +420,4 @@ const char *CpuGemmLowpOffsetContributionKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h index 3514ca811d..08b2d47529 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h @@ -63,24 +63,33 @@ public: * @param[in] a_offset Offset to be added to each element of the matrix A. * @param[in] b_offset Offset to be added to each element of the matrix B. 
*/ - void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + void configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpOffsetContributionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _slide_vector_sum_col{ true }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k_offset{0}; + bool _slide_vector_sum_col{true}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp index 190487eced..d008842398 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp @@ -31,10 +31,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -48,80 +49,38 @@ namespace { inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) { - return - { - { - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 12) - } - }; + return {{vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 12)}}; } inline int32x4x4_t load(const int32_t *ptr, int32_t x) { - return - { - { - vld1q_s32(ptr + x + 0), - vld1q_s32(ptr + x + 4), - vld1q_s32(ptr + x + 8), - vld1q_s32(ptr + x + 12) - } - }; + return {{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}}; } inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) { - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; + return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}}; } inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) { - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; + return {{vaddq_s32(a.val[0], b.val[0]), 
vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3])}}; } inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) { - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; + return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar)}}; } inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) { - return - { - { - vmulq_s32(a.val[0], vld1q_s32(multilpier)), - vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), - vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), - vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) - } - }; + return {{vmulq_s32(a.val[0], vld1q_s32(multilpier)), vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))}}; } inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) @@ -144,18 +103,11 @@ inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offse inline int32x4x4_t get_k_offset(int32_t k_offset) { - return - { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; + return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; } -inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) +inline uint8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -172,18 +124,13 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to U8 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = vmaxq_u8(out_u8, min_u8); out_u8 = vminq_u8(out_u8, max_u8); @@ -192,7 +139,8 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -209,18 +157,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), 
vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -229,7 +172,8 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 return out_s8; } -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -246,18 +190,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -305,81 +244,103 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) } template -inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, - typename VT::vtype min_vec, typename VT::vtype max_vec, - int32_t a_offset, int32_t b_offset, int32_t k_offset, - int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, + const int32_t *vector_sum_row_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32x4_t result_offset_s32, + const int32x4_t result_shift_s32, + typename VT::vtype min_vec, + typename VT::vtype max_vec, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + int32_t multiplier, + int32_t shift, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool has_a_offset, + bool has_b_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) { - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) { // Combine quantization offset with other offsets. 
offset_term_s32 = add_s32(offset_term_s32, result_offset_s32); } - if(has_a_offset && has_b_offset) + if (has_a_offset && has_b_offset) { offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset)); } - if(has_b_offset) + if (has_b_offset) { offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset)); } int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - if(has_a_offset) + if (has_a_offset) { in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); } - if(has_bias) + if (has_bias) { in_s32 = add_s32(in_s32, load(bias_ptr, x)); } - if(!is_fixed_point || has_b_offset) + if (!is_fixed_point || has_b_offset) { in_s32 = add_s32(in_s32, offset_term_s32); } - if(!is_fixed_point) + if (!is_fixed_point) { in_s32 = mul_s32(in_s32, multiplier); } - if(is_fixed_point) + if (is_fixed_point) { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); + wrapper::vstore( + reinterpret_cast(out_it.ptr() + x), + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); } else { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); + wrapper::vstore( + reinterpret_cast(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); } } // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + int32_t in_value = + *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - if(has_a_offset) + if (has_a_offset) { in_value += (*(vector_sum_col_ptr + x) * a_offset); } - if(has_bias) + if (has_bias) { in_value += *(bias_ptr + x); } - if(is_fixed_point) + if (is_fixed_point) { // Finalize and store the result - *reinterpret_cast(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, - static_cast(min_bound), - static_cast(max_bound), is_bounded_relu); + *reinterpret_cast(out_it.ptr() + x) = + finalize_quantization(in_value, multiplier, shift, offset, static_cast(min_bound), + static_cast(max_bound), is_bounded_relu); } else { @@ -387,75 +348,100 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su in_value = (in_value * multiplier) >> shift; // Bound and store the result - if(is_bounded_relu) + if (is_bounded_relu) { - in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); + in_value = static_cast( + std::max(min_bound, std::min(max_bound, in_value))); } - *reinterpret_cast(out_it.ptr() + x) = static_cast(std::max(static_cast(std::numeric_limits::lowest()), - std::min(static_cast(std::numeric_limits::max()), in_value))); + *reinterpret_cast(out_it.ptr() + x) = + static_cast(std::max( + static_cast(std::numeric_limits::lowest()), + std::min(static_cast(std::numeric_limits::max()), in_value))); } } } -inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32_t *result_multipliers, const int32_t *result_shifts, - const int32x4_t 
result_offset, int8x16_t min_s8, int8x16_t max_s8, - int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32_t *result_multipliers, + const int32_t *result_shifts, + const int32x4_t result_offset, + int8x16_t min_s8, + int8x16_t max_s8, + int32_t a_offset, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool has_a_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) { - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) { // Combine quantization offset with other offsets. offset_term_s32 = add_s32(offset_term_s32, result_offset); } int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - if(has_a_offset) + if (has_a_offset) { in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); } - if(has_bias) + if (has_bias) { in_s32 = add_s32(in_s32, load(bias_ptr, x)); } - if(!is_fixed_point) + if (!is_fixed_point) { in_s32 = add_s32(in_s32, offset_term_s32); in_s32 = mul_s32(in_s32, result_multipliers + x); } - if(is_fixed_point) + if (is_fixed_point) { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); + vst1q_s8(reinterpret_cast(out_it.ptr() + x), + finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), + result_offset, min_s8, max_s8, is_bounded_relu)); } else { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); + vst1q_s8( + reinterpret_cast(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); } } // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + int32_t in_value = + *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - if(has_a_offset) + if (has_a_offset) { in_value += (*(vector_sum_col_ptr + x) * a_offset); } - if(has_bias) + if (has_bias) { in_value += *(bias_ptr + x); } - if(is_fixed_point) + if (is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound), is_bounded_relu); + *(out_it.ptr() + x) = + finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, + static_cast(min_bound), static_cast(max_bound), is_bounded_relu); } else { @@ -463,7 +449,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); // Bound and store the result - if(is_bounded_relu) + if (is_bounded_relu) { in_value = static_cast(std::max(min_bound, 
std::min(max_bound, in_value))); } @@ -473,10 +459,20 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect } template -void run_offset_contribution_output_stage(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +void run_offset_contribution_output_stage(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo output_stage, + bool is_gemm3d, + bool is_bounded_relu, + bool is_fixed_point) { // Semantics of XYZW Explained for each tensor // @@ -516,7 +512,7 @@ void run_offset_contribution_output_stage(const Window &window, Iterator mm_result_it(mm_result, win); Iterator out_it(output, win); - if((a_offset != 0) && (b_offset != 0)) + if ((a_offset != 0) && (b_offset != 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -527,45 +523,52 @@ void run_offset_contribution_output_stage(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), - mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), + mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, + k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, true, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, 
vector_sum_row_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, + result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false, + is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } } - else if((a_offset == 0) && (b_offset != 0)) + else if ((a_offset == 0) && (b_offset != 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -573,114 +576,139 @@ void run_offset_contribution_output_stage(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + false, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_row_it, bias_it, mm_result_it, out_it); } else { - 
execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window( + nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_row_it, mm_result_it, out_it); } } - else if((a_offset != 0) && (b_offset == 0)) + else if ((a_offset != 0) && (b_offset == 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? 
vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + true, false, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window( + vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); } } else { - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window( + nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + 
result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, + shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false, + true, is_bounded_relu, is_fixed_point); + }, + bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window( + nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, + max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, + is_fixed_point); + }, + mm_result_it, out_it); } return; } } -void run_offset_contribution_output_stage_symm(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo output_stage, + bool is_gemm3d, + bool is_bounded_relu, + bool is_fixed_point) { ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); @@ -690,8 +718,8 @@ void run_offset_contribution_output_stage_symm(const Window &window, const int32_t min_bound = output_stage.gemmlowp_min_bound; const int32_t max_bound = output_stage.gemmlowp_max_bound; - const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); - const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); const int32x4_t result_offset_s32 = vdupq_n_s32(offset); const int8x16_t min_s8 = vdupq_n_s8(static_cast(min_bound)); const int8x16_t max_s8 = vdupq_n_s8(static_cast(max_bound)); @@ -708,88 +736,105 @@ void run_offset_contribution_output_stage_symm(const Window &window, Iterator mm_result_it(mm_result, win); Iterator out_it(output, win); - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? 
vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, + window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); } } else { - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, 
+ is_fixed_point); + }, + bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32, + min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, false, false, is_bounded_relu, is_fixed_point); + }, + mm_result_it, out_it); } return; } } -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(output->data_type() != DataType::QASYMM8) + if (output->data_type() != DataType::QASYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && + b_offset != 0); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && + output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -797,7 +842,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); @@ -805,19 +850,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != 
(mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = output->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -828,13 +875,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } @@ -842,7 +891,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3); } - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); @@ -852,15 +901,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } } // namespace -void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, +void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_UNUSED(vector_sum_row, bias); // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); _a_offset = a_offset; _b_offset = b_offset; @@ -868,7 +923,7 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo _output_stage = output_stage; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { // Check if vector_sum_col_shape should be slidden or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 @@ -888,16 +943,24 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo ICpuKernel::configure(win); } -Status 
CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); return Status{}; } -void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -912,14 +975,14 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); - int32_t type_min_int = type_min.get(); - int32_t type_max_int = type_max.get(); + int32_t type_min_int = type_min.get(); + int32_t type_max_int = type_max.get(); - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); + const bool is_bounded_relu = + !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); // Check if we need to perform fixed point requantization const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; @@ -930,22 +993,25 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors // Check if symmetric per-channel execution const bool is_symm = _output_stage.is_quantized_per_channel; - if(is_symm) + if (is_symm) { - run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, + _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, + _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { - if(is_signed) + if (is_signed) { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage( + window, mm_result, vector_sum_col, 
vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage( + window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } } } diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h index 3cb99faee8..af477d4756 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -85,7 +86,13 @@ public: * @param[in] b_offset Offset to be added to each element of the matrix B. * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. */ - void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, + void configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage); /** Static function to check if given info will lead to a valid configuration @@ -94,21 +101,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: /** Function to use for the particular tensors passed to configure() */ - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _is_vector_sum_col_batched{ true }; - GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k_offset{0}; + bool _is_vector_sum_col_batched{true}; + GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp index 3023d93113..eefc294700 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -28,13 +28,14 @@ #include "arm_compute/core/ITensor.h" #include 
"arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/AccessWindowStatic.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -46,26 +47,35 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED)) + if (dst->data_type() != output_stage->output_data_type && + (output_stage->output_data_type == DataType::QASYMM8 || + output_stage->output_data_type == DataType::QASYMM8_SIGNED)) { ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types"); } @@ -92,24 +102,26 @@ inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_ } template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) +inline + typename std::enable_if::value, typename wrapper::traits::neon_vector::type>::type + convert_to_8bit(const int16x8x2_t in_s16) { return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1])); } template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) +inline typename std::enable_if::value, typename wrapper::traits::neon_vector::type>::type +convert_to_8bit(const int16x8x2_t in_s16) { return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1])); } template -inline typename wrapper::traits::neon_vector::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename 
wrapper::traits::neon_vector::type min, - typename wrapper::traits::neon_vector::type max) +inline typename wrapper::traits::neon_vector::type +finalize_quantization(int32x4x4_t &in_s32, + int32x4_t result_shift_s32, + typename wrapper::traits::neon_vector::type min, + typename wrapper::traits::neon_vector::type max) { // Shift final result (negative value shift right) in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); @@ -118,13 +130,8 @@ inline typename wrapper::traits::neon_vector::type finalize_quantization( in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 or U8 typename wrapper::traits::neon_vector::type out = convert_to_8bit(in_s16); @@ -137,7 +144,10 @@ inline typename wrapper::traits::neon_vector::type finalize_quantization( } // namespace template -void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { using VectorType = typename wrapper::traits::neon_vector::type; @@ -159,107 +169,105 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, c Iterator in(src, win); Iterator out(dst, win); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, 
result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int in_value = *(reinterpret_cast(in.ptr()) + x); - - // Quantize - in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; - - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, bias_i, out); + const int bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int in_value = *(reinterpret_cast(in.ptr()) + x); + + // Quantize + in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * + _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; + + // Store the result + *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); + } + }, + in, bias_i, out); } else { - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int in_value = *(reinterpret_cast(in.ptr()) + x); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int in_value = *(reinterpret_cast(in.ptr()) + x); - // Quantize - in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; + // 
Quantize + in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, out); + // Store the result + *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -268,10 +276,7 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso // Output auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - bias, - dst, - output_stage)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); _output_stage = output_stage; @@ -281,14 +286,17 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso ICpuKernel::configure(win); // Check if we need to clamp the result using min and max - _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) - && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); - if(_output_stage->output_data_type == DataType::QASYMM8) + _is_bounded_relu = + ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) && + !(_output_stage->gemmlowp_min_bound == + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) && + _output_stage->gemmlowp_max_bound == + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); + if (_output_stage->output_data_type == DataType::QASYMM8) { _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; } - else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED) + else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED) { _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; } @@ -298,7 +306,10 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso } } -Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); return Status{}; @@ -323,4 +334,4 @@ const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h index c7813edcd7..33e296b251 100644 --- 
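Functionally nothing changes in the hunks above; the scalar left-over path spells out the per-element math of this kernel: add the optional bias and the GEMMLowp offset, multiply by the integer multiplier, arithmetic-shift right by the result shift, then clamp and narrow to the output type. A minimal standalone sketch of that step, illustrative only (the default clamp bounds assume a QASYMM8 output and are not taken from the library):

#include <algorithm>
#include <cstdint>

// Scalar model of the quantize-down-by-integer-scale step from the left-over loop above.
inline uint8_t quantize_down_scale(int32_t acc,        // S32 GEMM accumulator
                                   int32_t bias,       // 0 when no bias tensor is given
                                   int32_t offset,     // output_stage->gemmlowp_offset
                                   int32_t multiplier, // output_stage->gemmlowp_multiplier
                                   int32_t shift,      // output_stage->gemmlowp_shift
                                   int32_t clamp_min = 0,
                                   int32_t clamp_max = 255)
{
    int32_t v = ((acc + bias + offset) * multiplier) >> shift;
    v         = std::min(std::max(v, clamp_min), clamp_max);
    return static_cast<uint8_t>(v);
}

For QASYMM8_SIGNED output the same expression applies with clamp bounds of [-128, 127] and an int8_t store.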
a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -71,10 +72,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -95,11 +99,14 @@ private: * @param[out] dst Output tensor info * @param[in] window Region on which to execute the kernel. */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - const GEMMLowpOutputStageInfo *_output_stage{ nullptr }; - bool _is_bounded_relu{ false }; + QuantizeDownFunctionPtr _func{nullptr}; + const GEMMLowpOutputStageInfo *_output_stage{nullptr}; + bool _is_bounded_relu{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp index 53ca991889..a5c09c9977 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NESymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NESymm.h" #include @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const 
ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int16x8_t min_s16 = vdupq_n_s16(static_cast(_min)); const int16x8_t max_s16 = vdupq_n_s16(static_cast(_max)); @@ -88,92 +92,92 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(co Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x2_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4)}}; - const int32x4x2_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4) - } - }; + const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4)}}; - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } + vst1q_s16(reinterpret_cast(out.ptr()) + x, + finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out, bias_i); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), + static_cast(_max)); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x2_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), 
- vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4)}}; - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } + vst1q_s16(reinterpret_cast(out.ptr()) + x, + finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - ARM_COMPUTE_UNUSED(in_value); - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + ARM_COMPUTE_UNUSED(in_value); + // Finalize and store the result + *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), + static_cast(_max)); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int min, int max) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min, + int max) { // Perform validate step ARM_COMPUTE_UNUSED(bias, dst); @@ -193,18 +197,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITens // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= -32768 && max >= 32767); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal + : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal; } -Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h index 681d099695..925788b680 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -48,7 +49,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. * */ -class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel +class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; @@ -65,17 +67,24 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -97,13 +106,13 @@ private: * @param[in] window Region on which to execute the kernel. 
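The ScaleByFixedPoint kernel reformatted above performs the analogous step with a fixed-point multiplier instead of a plain integer scale. The hunks only show the calls into finalize_quantization_int16, so the following is a rough scalar model of what that multiply-and-shift amounts to (a gemmlowp-style doubling high multiply, which the NEON helpers typically map to SQRDMULH, followed by a rounding right shift), not the library helper itself; a non-negative result_shift is assumed, and corner-case saturation and negative-tie rounding are simplified:

#include <algorithm>
#include <cstdint>

// Rough scalar model of "scale by fixed point" with a QSYMM16 clamp.
inline int16_t requantize_fixed_point_s16(int32_t acc,
                                          int32_t fixedpoint_multiplier, // Q0.31-style multiplier
                                          int     result_shift)          // assumed >= 0 here
{
    const int64_t prod = static_cast<int64_t>(acc) * fixedpoint_multiplier;
    int32_t       v    = static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31); // doubling high multiply with rounding
    if (result_shift > 0)
    {
        v = (v + (1 << (result_shift - 1))) >> result_shift; // rounding divide by 2^result_shift
    }
    v = std::min(std::max(v, -32768), 32767); // clamp to [-32768, 32767] or the bounded-ReLU range
    return static_cast<int16_t>(v);
}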
*/ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp index 27214dcb5a..0e58097073 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" #include @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); const int8x16_t min_s8 = vdupq_n_s8(static_cast(_min)); @@ -88,102 +92,102 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(con Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - 
vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_s8(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); + const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *reinterpret_cast(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - vst1q_s8(reinterpret_cast(out.ptr() + x), - 
finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + vst1q_s8(reinterpret_cast(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Finalize and store the result + *reinterpret_cast(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -205,18 +209,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITenso // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= -128 && max >= 127); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal + : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal; } -Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h index 3e615b935e..6a67ba4f19 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -49,7 +50,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. * */ -class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel +class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; @@ -67,17 +69,25 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -99,14 +109,14 @@ private: * @param[in] window Region on which to execute the kernel. 
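The QASYMM8_SIGNED kernel above applies the same fixed-point scaling but adds a zero-point, result_offset_after_shift, after the shift and clamps to the 8-bit range; is_bounded_relu merely records whether the requested [min, max] is tighter than the type's natural [-128, 127]. A compact illustrative sketch of that difference, with the same simplifications as the QSYMM16 model:

#include <algorithm>
#include <cstdint>

// Same scaling as before, plus the post-shift zero-point of the asymmetric output type.
inline int8_t requantize_fixed_point_s8(int32_t acc,
                                        int32_t fixedpoint_multiplier,
                                        int     result_shift,
                                        int32_t offset_after_shift,
                                        int32_t min = -128, // full range, i.e. no bounded ReLU
                                        int32_t max = 127)
{
    const int64_t prod = static_cast<int64_t>(acc) * fixedpoint_multiplier;
    int32_t       v    = static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31);
    if (result_shift > 0)
    {
        v = (v + (1 << (result_shift - 1))) >> result_shift;
    }
    v += offset_after_shift;             // added after the shift, hence the member's name
    v = std::min(std::max(v, min), max); // [-128, 127] unless a bounded ReLU narrows it
    return static_cast<int8_t>(v);
}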
*/ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index e49fd29115..e3dd2240ca 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" #include @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); const uint8x16_t min_u8 = vdupq_n_u8(static_cast(_min)); @@ -89,98 +93,102 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(co Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - 
vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); + const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast(_min), + static_cast(_max), is_bounded_relu); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, 
_result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + x + 12)}}; + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); + + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast(_min), + static_cast(_max), is_bounded_relu); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -202,18 +210,21 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITens // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= 0 && max >= 255); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal + : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal; } -Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -233,4 +244,4 @@ const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() c } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index b773fdfdcf..45bd742a70 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -49,7 +50,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. 
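All of the run_internal bodies reformatted in this file and the two before it share the same traversal: a main loop consuming window_step_x (16) elements per iteration with NEON loads and stores, followed by a scalar loop for the left-over tail, the whole thing driven per row by execute_window_loop. Stripped of the quantization details, the skeleton looks like this (a placeholder copy stands in for the real vector body):

#include <cstdint>

// Skeleton of the main-loop-plus-scalar-tail pattern used by the run_internal methods above.
void process_row(const int32_t *in, int32_t *out, int window_start_x, int window_end_x)
{
    constexpr int window_step_x = 16;

    int x = window_start_x;
    for (; x <= (window_end_x - window_step_x); x += window_step_x)
    {
        // Vector path: the kernels load four int32x4_t here, requantize and store 16 results.
        for (int i = 0; i < window_step_x; ++i)
        {
            out[x + i] = in[x + i];
        }
    }
    for (; x < window_end_x; ++x)
    {
        // Scalar left-over path, one element at a time.
        out[x] = in[x];
    }
}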
* */ -class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel +class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; @@ -67,17 +69,25 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -93,14 +103,14 @@ private: * @param[in] window Region on which to execute the kernel. */ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp index 6399ebbef4..fb1b70b91f 100644 --- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" #include "src/cpu/kernels/gemm_matrix_add/list.h" namespace arm_compute { @@ -40,24 +41,12 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_fp32_gemm_matrix_add", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add) - }, - { - "neon_fp16_gemm_matrix_add", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - 
REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add) - }, +static const std::vector available_kernels = { + {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)}, + {"neon_fp16_gemm_matrix_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)}, }; } // namespace @@ -71,7 +60,8 @@ void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta)); _beta = beta; - const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuGemmMatrixAdditionKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _func = uk->ukernel; // Configure kernel window @@ -87,7 +77,7 @@ Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITens ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -105,7 +95,7 @@ void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &win const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - if(_beta != 0.0f) + if (_beta != 0.0f) { (*_func)(src, dst, window, _beta); } @@ -116,7 +106,8 @@ const char *CpuGemmMatrixAdditionKernel::name() const return "CpuGemmMatrixAdditionKernel"; } -const std::vector &CpuGemmMatrixAdditionKernel::get_available_kernels() +const std::vector & +CpuGemmMatrixAdditionKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h index cbc5b53087..5e12f1dcbd 100644 --- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h @@ -75,7 +75,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector &get_available_kernels(); @@ -89,8 +89,8 @@ private: * @param[in] beta Weight of matrix C */ /** Matrix addition function to use for the particular tensor types passed to configure() */ - GemmMatrixAddKernelPtr _func{ nullptr }; - float _beta{ 0.f }; + GemmMatrixAddKernelPtr _func{nullptr}; + float _beta{0.f}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp index 03b372efd4..beccd94844 100644 --- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp @@ -26,10 +26,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" 
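CpuGemmMatrixAdditionKernel, whose hunks end here, accumulates a beta-weighted matrix onto the GEMM result (in effect dst += beta * src, with src holding the C matrix of the GEMM); run_op skips the call entirely when beta is zero, and the actual work is done by the FP32 or FP16 NEON micro-kernel picked from the available_kernels table. A scalar reference of the accumulation, for illustration only:

#include <cstddef>

// Illustrative scalar version of the beta-weighted matrix addition (dst += beta * src).
void gemm_matrix_add_ref(const float *src, float *dst, std::size_t elems, float beta)
{
    if (beta == 0.0f)
    {
        return; // mirrors the _beta != 0.0f guard in run_op
    }
    for (std::size_t i = 0; i < elems; ++i)
    {
        dst[i] += beta * src[i];
    }
}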
+#include "arm_compute/core/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/gemm_matrix_mul/list.h" @@ -42,27 +43,20 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "neon_fp32_gemm_matrix_mul", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul) - }, - { - "neon_fp16_gemm_matrix_mul", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul) - }, +static const std::vector available_kernels = { + {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)}, + {"neon_fp16_gemm_matrix_mul", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)}, }; -inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +inline Status validate_arguments(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_UNUSED(alpha); @@ -70,11 +64,11 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - if(!is_interleaved) + if (!is_interleaved) { ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1)); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1)); @@ -90,28 +84,31 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); /* Interleave */ - TensorShape tensor_shape0{ lhs->tensor_shape() }; + TensorShape tensor_shape0{lhs->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape( + misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0); - if(n != 0) /* Transpose */ + if (n != 0) /* Transpose */ { - TensorShape tensor_shape1{ rhs->tensor_shape() }; + TensorShape tensor_shape1{rhs->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); + const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + 
rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape( + tensor_info1, mult_transpose1xW_width)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - if(n != 0) + if (n != 0) { ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast(n)); } @@ -125,12 +122,17 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, } // namespace -void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); // dst tensor auto inizialitation if not yet initialized - TensorShape tensor_shape{ lhs->tensor_shape() }; + TensorShape tensor_shape{lhs->tensor_shape()}; tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0)); tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1)); @@ -146,7 +148,7 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso // Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication const bool is_dst_vector = (dst->dimension(1) == 1); - if(is_dst_vector) + if (is_dst_vector) { const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32; @@ -157,17 +159,23 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso constexpr unsigned int num_elems_processed_per_iteration_x = 8; constexpr unsigned int num_elems_processed_per_iteration_y = 4; - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); } - const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ lhs->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation( + DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _func = uk->ukernel; ICPPKernel::configure(win); } -Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, +Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); @@ -195,7 +203,8 @@ const char *CpuGemmMatrixMultiplyKernel::name() const return "CpuGemmMatrixMultiplyKernel"; } -const std::vector &CpuGemmMatrixMultiplyKernel::get_available_kernels() +const std::vector & +CpuGemmMatrixMultiplyKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h index a7dfec87bd..765fcb8275 100644 --- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h @@ -42,7 +42,8 @@ namespace kernels class CpuGemmMatrixMultiplyKernel : public ICpuKernel { private: - 
using GemmMatrixMulKernelPtr = std::add_pointer::type; + using GemmMatrixMulKernelPtr = std::add_pointer::type; public: struct GemmMatrixMulKernel @@ -67,17 +68,27 @@ public: * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped */ - void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); + void configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel * * Similar to @ref CpuGemmMatrixMultiplyKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector &get_available_kernels(); @@ -94,8 +105,8 @@ private: */ /** Matrix multiply function to use for the particular tensor types passed to configure() */ - GemmMatrixMulKernelPtr _func{ nullptr }; - float _alpha{ 1.f }; + GemmMatrixMulKernelPtr _func{nullptr}; + float _alpha{1.f}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp index 62d5d5f5e9..c47746bc4b 100644 --- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp @@ -24,9 +24,10 @@ #include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -63,9 +64,10 @@ Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
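For reference, the 1xW packing that this validate() checks can be summarised by a minimal scalar sketch (a hypothetical standalone helper, not the library's NEON code; transpose_1xW_reference and its parameters are illustrative). Each block of W = 16 / sizeof(T) consecutive elements of an input row lands in the output row assigned to that column block, and slots past the input width are zero-filled, matching the memset in the run_op loop further below.

#include <cstddef>
#include <vector>

// Scalar model of compute_transpose1xW_with_element_size_shape plus the copy loop:
// one output row per block of W input columns, W consecutive slots per input row.
template <typename T>
std::vector<T> transpose_1xW_reference(const std::vector<T> &src, std::size_t in_w, std::size_t in_h)
{
    const std::size_t W     = 16 / sizeof(T);      // elements per 128-bit chunk
    const std::size_t out_w = in_h * W;            // W slots for every input row
    const std::size_t out_h = (in_w + W - 1) / W;  // one output row per column block
    std::vector<T>    dst(out_w * out_h, T(0));    // zero-fill covers the ragged tail

    for (std::size_t y = 0; y < in_h; ++y)
    {
        for (std::size_t x = 0; x < in_w; ++x)
        {
            const std::size_t out_row = x / W;           // which column block
            const std::size_t out_col = y * W + (x % W); // slot inside that block
            dst[out_row * out_w + out_col] = src[y * in_w + x];
        }
    }
    return dst;
}

For F32 this gives W = 4, so a 2 x 6 input packs into a 2 x 8 output in which the slots belonging to the missing input columns 6 and 7 are left at zero.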
- if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_transpose1xW_with_element_size_shape(*src)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -107,25 +109,28 @@ void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &windo const size_t out_stride = dst->info()->strides_in_bytes()[1]; const size_t vector_size = 16 / element_size; - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *in_ptr = in.ptr(); - uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; - - for(size_t k = 0; k < vector_size; ++k) + execute_window_loop( + window, + [&](const Coordinates &id) { - // If the src width is not multiple of W, we fill the reference with 0s - if((id.x() + k) >= in_width) - { - std::memset(out_ptr + k * element_size, 0, element_size); - } - else + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = + out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; + + for (size_t k = 0; k < vector_size; ++k) { - std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + // If the src width is not multiple of W, we fill the reference with 0s + if ((id.x() + k) >= in_width) + { + std::memset(out_ptr + k * element_size, 0, element_size); + } + else + { + std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + } } - } - }, - in, out); + }, + in, out); } const char *CpuGemmTranspose1xWKernel::name() const diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h index 0ca92641b7..4b834b2cc6 100644 --- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h @@ -88,7 +88,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp index 9ac291549b..55ac7c5192 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.cpp +++ b/src/cpu/kernels/CpuIm2ColKernel.cpp @@ -29,13 +29,13 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - #include #include #include @@ -51,26 +51,34 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D 
&dilation, + unsigned int num_groups, + unsigned int input_pad_right) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions - const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); + const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - if(output->total_size() > 0) + if (output->total_size() > 0) { - TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); + TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape( + input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -106,14 +114,14 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, // This for loop linearize a volume with 3 slices. 
This allows: // 1) to reduce the iterations of the outer for loop "d" // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs - for(; d <= (kernel_depth - 3); d += 3) + for (; d <= (kernel_depth - 3); d += 3) { - for(int y = top_left_y; y < y_e; y += dilation_y) + for (int y = top_left_y; y < y_e; y += dilation_y) { - if((y < 0 || y >= input_h) && has_pads) + if ((y < 0 || y >= input_h) && has_pads) { // All the values will be the offset (will be zeros when not quantized) - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) { *(out_ptr + 0 * kernel_size2) = pad_value; *(out_ptr + 1 * kernel_size2) = pad_value; @@ -122,9 +130,9 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } else { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) { - if((x < 0 || x >= input_w) && has_pads) + if ((x < 0 || x >= input_w) && has_pads) { *(out_ptr + 0 * kernel_size2) = pad_value; *(out_ptr + 1 * kernel_size2) = pad_value; @@ -132,9 +140,12 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } else { - *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast( + in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast( + in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast( + in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); } } } @@ -143,11 +154,11 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } // Left over - for(; d < kernel_depth; d++) + for (; d < kernel_depth; d++) { - for(int y = top_left_y; y < y_e; y += dilation_y) + for (int y = top_left_y; y < y_e; y += dilation_y) { - if((y < 0 || y >= input_h) && has_pads) + if ((y < 0 || y >= input_h) && has_pads) { // All the values will be the offset (will be zeros when not quantized) memset(static_cast(out_ptr), pad_value, kernel_width * sizeof(T)); @@ -155,15 +166,16 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } else { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) { - if((x < 0 || x >= input_w) && has_pads) + if ((x < 0 || x >= input_w) && has_pads) { *out_ptr = pad_value; } else { - *out_ptr = *(reinterpret_cast(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); + *out_ptr = *(reinterpret_cast( + in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); } } } @@ -171,7 +183,7 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } // Append 1 if the convolution layer has biases - if(has_bias) + if (has_bias) { *out_ptr = static_cast(1); } @@ -198,36 +210,39 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, const int end_y = start_y + kernel_height * dilation_y; const int pad_quant = kernel_width * input_c; const int element_size = static_cast(sizeof(T)); - if((start_y >= 0) && 
(end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size)) + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == input_c * element_size)) { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < end_y; y += dilation_y) { //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); out_ptr += input_c * kernel_width; } } else { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < end_y; y += dilation_y) { - if(y < 0 || y >= input_h) + if (y < 0 || y >= input_h) { memset(static_cast(out_ptr), pad_value, pad_quant * element_size); out_ptr += pad_quant; } - else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) { - for(int x = start_x; x < end_x; x += dilation_x) + for (int x = start_x; x < end_x; x += dilation_x) { - if(x < 0 || x >= input_w) + if (x < 0 || x >= input_w) { memset(static_cast(out_ptr), pad_value, input_c * element_size); out_ptr += input_c; } else { - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size); + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), + input_c * element_size); out_ptr += input_c; } } @@ -235,13 +250,14 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, else { //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); out_ptr += input_c * kernel_width; } } } // Append 1 if the convolution layer has biases - if(has_bias) + if (has_bias) { *out_ptr = static_cast(1); } @@ -271,12 +287,13 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, const int element_size = static_cast(sizeof(T)); const int channel_chunk_size = input_c * element_size; - if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size)) + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == channel_chunk_size)) { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < end_y; y += dilation_y) { const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); - for(int e = 0; e < kernel_width; e++) + for (int e = 0; e < kernel_width; e++) { memcpy(out_ptr, reinterpret_cast(offset_ptr + e * channel_chunk_size), channel_chunk_size); out_ptr += input_c + pad_right; @@ -285,25 +302,26 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, } else { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < end_y; y += dilation_y) { - if(y < 0 || y >= input_h) + if (y < 0 || y >= input_h) { memset(static_cast(out_ptr), pad_value, pad_quant * element_size); out_ptr += pad_quant; } - else if(dilation_x > 1 || 
start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) { - for(int x = start_x; x < end_x; x += dilation_x) + for (int x = start_x; x < end_x; x += dilation_x) { - if(x < 0 || x >= input_w) + if (x < 0 || x >= input_w) { memset(static_cast(out_ptr), pad_value, (input_c + pad_right) * element_size); out_ptr += input_c + pad_right; } else { - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size); + memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), + channel_chunk_size); out_ptr += input_c + pad_right; } } @@ -311,16 +329,17 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, else { const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); - for(int e = 0; e < kernel_width; e++) + for (int e = 0; e < kernel_width; e++) { - memcpy(out_ptr, reinterpret_cast(offset_ptr + e * channel_chunk_size), channel_chunk_size); + memcpy(out_ptr, reinterpret_cast(offset_ptr + e * channel_chunk_size), + channel_chunk_size); out_ptr += input_c + pad_right; } } } } // Append 1 if the convolution layer has biases - if(has_bias) + if (has_bias) { *out_ptr = static_cast(1); } @@ -348,7 +367,8 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window const int pad_top = _conv_info.pad_top(); const int stride_x = _conv_info.stride().first; const int stride_y = _conv_info.stride().second; - const int pad_value = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0; + const int pad_value = + is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0; Window window_in_out(window); // The first three dimensions of the input and output are increased by the inner loops @@ -361,84 +381,57 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window Iterator out(dst, window_in_out); execute_window_loop( - window, [&](const Coordinates & id) - { - const int start_w = id[width_idx] * stride_x - pad_left; - const int start_h = id[height_idx] * stride_y - pad_top; + window, + [&](const Coordinates &id) + { + const int start_w = id[width_idx] * stride_x - pad_left; + const int start_h = id[height_idx] * stride_y - pad_top; - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - auto output_ptr = reinterpret_cast(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y()); + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + auto output_ptr = + reinterpret_cast(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * + dst->info()->strides_in_bytes().y()); - // Linearize volume - if(is_nchw) - { - linearize_volume_nchw(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_c, - input_w, - input_h, - input_stride_x, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - else - { - if(_input_pad_right > 0) + // Linearize volume + if (is_nchw) { - linearize_volume_nhwc(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y(), - _input_pad_right); + linearize_volume_nchw( + input_ptr, output_ptr, _has_bias, start_w, start_h, 
_kernel_width, _kernel_height, input_c, input_w, + input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y()); } else { - linearize_volume_nhwc(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); + if (_input_pad_right > 0) + { + linearize_volume_nhwc(input_ptr, output_ptr, _has_bias, start_w, start_h, + _kernel_width, _kernel_height, input_w, input_h, input_c, + input_stride_y, input_stride_z, pad_value, _dilation.x(), + _dilation.y(), _input_pad_right); + } + else + { + linearize_volume_nhwc( + input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_w, + input_h, input_c, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y()); + } } - } - }, - in, out); + }, + in, out); } -void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +void CpuIm2ColKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); ARM_COMPUTE_UNUSED(num_groups); _data_layout = src->data_layout(); @@ -451,31 +444,34 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const _kernel_height = kernel_dims.height; _input_pad_right = input_pad_right; _dilation = dilation; - _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), - _kernel_width, _kernel_height, - _conv_info, _dilation); + _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width, + _kernel_height, _conv_info, _dilation); _has_bias = has_bias; - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QASYMM8_SIGNED: case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? 
&CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; default: ARM_COMPUTE_ERROR("Data type not supported"); @@ -484,26 +480,31 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } else { - switch(src->data_type()) + switch (src->data_type()) { case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; case DataType::QASYMM8_SIGNED: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col + : &CpuIm2ColKernel::run_im2col; break; default: ARM_COMPUTE_ERROR("Data type not supported"); @@ -512,11 +513,13 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right))); + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, + false, num_groups, _input_pad_right))); - std::pair convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), - kernel_dims.width, kernel_dims.height, - conv_info, dilation); + std::pair convolved_dims = + scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height, + conv_info, dilation); Window win = calculate_max_window(*src, Steps()); win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1)); @@ -526,10 +529,17 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const ICpuKernel::configure(win); } -Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +Status CpuIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); return 
Status{}; } diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h index d133f8dc2d..2cb26179ce 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.h +++ b/src/cpu/kernels/CpuIm2ColKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_IM2COL_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -78,16 +79,28 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported * @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuIm2ColKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -117,15 +130,15 @@ private: */ using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - Im2ColFunctionPtr _func{ nullptr }; + Im2ColFunctionPtr _func{nullptr}; std::pair _convolved_dims{}; PadStrideInfo _conv_info{}; - unsigned int _kernel_width{ 0 }; - unsigned int _kernel_height{ 0 }; - unsigned int _input_pad_right{ 0 }; - bool _has_bias{ false }; - Size2D _dilation{ 1U, 1U }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _kernel_width{0}; + unsigned int _kernel_height{0}; + unsigned int _input_pad_right{0}; + bool _has_bias{false}; + Size2D _dilation{1U, 1U}; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 39adc9af7c..b7daa4d583 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H #include "arm_compute/core/Types.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" namespace arm_compute @@ -78,10 +79,10 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData struct ActivationDataTypeISASelectorData { - DataType dt; - const CPUModel &cpumodel; - const cpuinfo::CpuIsaInfo &isa; - const ActivationFunction f; + DataType dt; + const CPUModel &cpumodel; + const cpuinfo::CpuIsaInfo &isa; + const ActivationFunction f; }; struct CpuAddKernelDataTypeISASelectorData @@ -99,15 +100,19 @@ struct ScaleKernelDataTypeISASelectorData }; // Selector pointer types -using 
DataTypeISASelectorPtr = std::add_pointer::type; -using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; -using PoolDataTypeISASelectorPtr = std::add_pointer::type; -using ElementwiseDataTypeISASelectorPtr = std::add_pointer::type; -using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer::type; -using CastDataTypeISASelectorDataPtr = std::add_pointer::type; -using ActivationDataTypeISASelectorDataPtr = std::add_pointer::type; -using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer::type; -using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer::type; +using DataTypeISASelectorPtr = std::add_pointer::type; +using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; +using PoolDataTypeISASelectorPtr = std::add_pointer::type; +using ElementwiseDataTypeISASelectorPtr = std::add_pointer::type; +using DepthwiseConv2dNativeDataTypeISASelectorPtr = + std::add_pointer::type; +using CastDataTypeISASelectorDataPtr = std::add_pointer::type; +using ActivationDataTypeISASelectorDataPtr = + std::add_pointer::type; +using CpuAddKernelDataTypeISASelectorDataPtr = + std::add_pointer::type; +using ScaleKernelDataTypeISASelectorDataPtr = + std::add_pointer::type; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp index 7d077c75bf..bcaa76b99b 100644 --- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp +++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp @@ -24,11 +24,12 @@ #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/maxunpool/list.h" @@ -43,50 +44,43 @@ using namespace misc::shape_calculator; namespace { -static const std::vector available_kernels = -{ - { - "neon_fp32_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(neon_fp32_maxunpooling) - }, - { - "neon_fp16_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_maxunpooling) - }, - { - "neon_qu8_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling) - }, - { - "neon_qs8_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling) - }, +static const std::vector available_kernels = { + {"neon_fp32_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(neon_fp32_maxunpooling)}, + {"neon_fp16_maxunpooling", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_maxunpooling)}, + {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)}, + {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + 
REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_size_x = pool_info.pool_size.width; - const int pool_size_y = pool_info.pool_size.height; + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -96,13 +90,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, co } } // namespace -void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info) +void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info)); ARM_COMPUTE_UNUSED(indices); - const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; @@ -113,7 +111,10 @@ void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensor ICpuKernel::configure(window); } -Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); 
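The micro-kernels registered above (neon_fp32_maxunpooling and friends) all compute the same scatter; a conceptual scalar sketch, assuming flat U32 indices recorded by the matching 2x2 MAX pooling pass (the function and parameter names below are illustrative, not library API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Assumed semantics: every source value is written to the destination position
// recorded during the forward MAX pooling pass; all other outputs stay zero.
std::vector<float> max_unpool_reference(const std::vector<float>    &src,
                                        const std::vector<uint32_t> &indices, // same shape as src
                                        std::size_t                  dst_size)
{
    std::vector<float> dst(dst_size, 0.0f);
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        dst[indices[i]] = src[i];
    }
    return dst;
}

This is also why validate_arguments() above requires U32 indices with the same shape as src and rejects anything other than 2x2 MAX pooling.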
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info)); diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h index d0c13471c8..5a641a2bea 100644 --- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h +++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuMaxUnpoolingLayerKernel : public ICpuKernel { private: - using MaxUnpoolingUKernelPtr = std::add_pointer::type; + using MaxUnpoolingUKernelPtr = std::add_pointer::type; public: /** Default constructor */ @@ -56,7 +57,8 @@ public: * @param[out] dst Destination tensor. Data types supported: Same as @p src * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel * * @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -66,7 +68,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -83,7 +88,7 @@ public: const char *name() const override; private: - MaxUnpoolingUKernelPtr _run_method{ nullptr }; + MaxUnpoolingUKernelPtr _run_method{nullptr}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index b73d2bdf73..ba086e3ac6 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -25,23 +25,24 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include namespace { #if defined(ENABLE_FP32_KERNELS) - static constexpr size_t default_mws_N1_fp32_neon = 22447; - static constexpr size_t default_mws_V1_fp32_neon = 38982; +static constexpr size_t default_mws_N1_fp32_neon = 22447; +static constexpr size_t default_mws_V1_fp32_neon = 38982; #endif /* ENABLE_FP32_KERNELS */ - static constexpr size_t default_mws_other_platforms_1d_tensor = 10240; -} +static constexpr size_t default_mws_other_platforms_1d_tensor = 10240; +} // namespace namespace arm_compute { namespace cpu @@ -54,29 +55,38 @@ const float scale255_constant = 1.f / 255.f; const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); -inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +inline Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + 
ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::S32, DataType::F16, DataType::F32); - if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) + if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, + "ConvertPolicy cannot be WRAP if datatype is quantized"); } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // clang-format off ARM_COMPUTE_RETURN_ERROR_ON_MSG( @@ -88,13 +98,17 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) , "Invalid data type combination"); // clang-format on - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && + scale != 1.f, + "Unsupported scale for QSYMM16 inputs and S32 dst"); } - if(std::abs(scale - scale255_constant) < 0.00001f) + if (std::abs(scale - scale255_constant) < 0.00001f) { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != 
RoundingPolicy::TO_NEAREST_UP && + rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && + dst->data_type() == DataType::S32, "Scale == 1/255 is not supported if input and dst are of data type S32"); } else @@ -107,7 +121,8 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 // Moreover, it will be negative as we deal with 1/2^n - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), + "Scale value not supported (Should be 1/(2^n) or 1/255"); } return Status{}; @@ -168,9 +183,9 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -178,7 +193,7 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -190,52 +205,52 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor using ExactTagType = typename wrapper::traits::neon_vector::tag_type; execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); - const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const float32x4x4_t out_f32x4x4 = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); + const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(non_broadcast_input_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; 
++x) + { + // Dequantize inputs + const T src1 = *(non_broadcast_input_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); + const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -251,56 +266,59 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto input1_q = wrapper::vloadq(input1_ptr + x); - const auto input2_q = wrapper::vloadq(input2_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float32x4x4_t out_f32x4x4 = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } + const auto input1_q = wrapper::vloadq(input1_ptr + x); + const auto input2_q = wrapper::vloadq(input2_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(input1_ptr + x); - const T src2 = *(input2_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(input1_ptr + x); + const T src2 = *(input2_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); + const float tmp_in2 = 
Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); } } -bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale) +bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + float scale) { const auto iq0 = src0->quantization_info().uniform(); const auto iq1 = src1->quantization_info().uniform(); @@ -308,7 +326,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale; - if(multiplier < -8191.f || multiplier > 8191.f) + if (multiplier < -8191.f || multiplier > 8191.f) { //The multiplier cannot be stored as a 14.18 signed fixed-point number return false; @@ -318,7 +336,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo const auto max_result = multiplier * (256) * (256) + offset_out; - if(max_result > 8191.f) + if (max_result > 8191.f) { //It might not be possible to store the result as a 14.18 signed fixed-point number. return false; @@ -366,7 +384,7 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d const auto out_offset_14p18 = static_cast(out_offset * two_pwr18i); const auto multiplier_14p18 = static_cast(multiplier * two_pwr18f); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Prefix: a = non-broadcast, b = broadcast. @@ -392,78 +410,76 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto a_ptr = reinterpret_cast(a_input_it.ptr()); - const auto b_ptr = reinterpret_cast(b_input_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); - - const auto b_val = *b_ptr; - const auto b_offseted_32p0 = static_cast(b_val - b_offset_16p0); - const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag()); + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); - const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); - const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); + const auto b_val = *b_ptr; + const auto b_offseted_32p0 = static_cast(b_val - b_offset_16p0); + const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag()); - int x = window_start_x; + const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); + const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); - - // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. 
- const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); - const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); - - const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0); - const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0); - const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0); - const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0); - - const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0); - const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0); - const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0); - const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0); - - const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); - const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); - const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); - const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); - - // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. - const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); - const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); - const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); - const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); - - const auto vout_15p1_0 = wrapper::vcombine( - vout_15p1_00, - vout_15p1_01); - - const auto vout_15p1_1 = wrapper::vcombine( - vout_15p1_10, - vout_15p1_11); - const auto out_ptr = reinterpret_cast(out_it.ptr()); + int x = window_start_x; - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<2>(vout_15p1_0), - wrapper::vqrshrn<2>(vout_15p1_1)); - wrapper::vstore(out_ptr + x, vout_8p0); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. 
+ const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0); + const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. + const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01); + + const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11); + const auto out_ptr = reinterpret_cast(out_it.ptr()); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } - //Process the left-over elements. - for(; x < window_end_x; ++x) - { + //Process the left-over elements. 
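Note: the "shift rights ... to revert the multiplication by twopwr18" comment above describes an 18-bit rounded shift split into three instructions (a plain >>8, a rounding narrowing >>8, then a rounding narrowing >>2), because, per that comment, a single shift here is limited to 8 bits. A scalar illustration of the staging, ignoring the saturation that the narrowing instructions also apply and assuming an arithmetic right shift for negative values (as on AArch64); the function name is illustrative:

    #include <cstdint>

    // Approximates vqrshrn<2>(vqrshrn_ex<8>(vshrq_n<8>(v))) on a single lane:
    // a truncating >>8, then rounded shifts of 8 and 2 bits, i.e. 18 bits in
    // total, recovering the integer part of a Q14.18 value.
    inline int32_t staged_shift_18(int32_t v_14p18)
    {
        int32_t t = v_14p18 >> 8;    // truncating arithmetic shift
        t = (t + (1 << 7)) >> 8;     // rounded shift by 8
        t = (t + (1 << 1)) >> 2;     // rounded shift by 2
        return t;
    }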
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t( - b_val) - b_offset_16p0)) + out_offset_14p18))); + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>( + (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) + + out_offset_14p18))); #else //__aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset))); + out_ptr[x] = utility::clamp(support::cpp11::lround( + multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset))); #endif //__aarch64__ - } - }, - a_input_it, b_input_it, out_it); + } + }, + a_input_it, b_input_it, out_it); } else { @@ -481,82 +497,83 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto in0_ptr = reinterpret_cast(in0_it.ptr()); - const auto in1_ptr = reinterpret_cast(in1_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); + win, + [&](const Coordinates &) + { + const auto in0_ptr = reinterpret_cast(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast(in1_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); - int x = window_start_x; + int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); - const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); - - // Widen the input elements to signed 16-bit regardless of the input signedness. 
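Note: putting the pieces together, the per-element computation of this fixed-point path (both the broadcast branch above and the non-broadcast branch that follows) is: subtract the input offsets, multiply, multiply-accumulate with the Q14.18 multiplier and output offset, then drop the 18 fractional bits with rounding and saturate to 8 bits. A plain scalar reference, assuming QASYMM8_SIGNED (int8) data; the function name is illustrative and rounding may differ from the staged NEON shifts by 1 LSB in edge cases:

    #include <algorithm>
    #include <cstdint>

    inline int8_t mul_q8_fixedpoint_ref(int8_t a, int8_t b,
                                        int32_t a_offset, int32_t b_offset,
                                        int32_t multiplier_14p18, int32_t out_offset_14p18)
    {
        // (a - a_offset) * (b - b_offset) fits in 17 bits; the Q14.18
        // multiply-accumulate is done in 64 bits here for clarity.
        const int64_t acc = static_cast<int64_t>(multiplier_14p18) *
                                ((static_cast<int32_t>(a) - a_offset) *
                                 (static_cast<int32_t>(b) - b_offset)) +
                            out_offset_14p18;
        const int64_t rounded = (acc + (int64_t(1) << 17)) >> 18; // rounded shift by 18
        return static_cast<int8_t>(std::clamp<int64_t>(rounded, INT8_MIN, INT8_MAX));
    }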
- const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); - const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); - const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); - const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); - - const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0); - const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0); - const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0); - const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0); - - const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0); - const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0); - const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0); - const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0); - - const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00); - const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01); - const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10); - const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11); - - const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); - const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); - const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); - const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); - - // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. - const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); - const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); - const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); - const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); - - const auto vout_14p2_0 = wrapper::vcombine( - vout_14p2_00, - vout_14p2_01); - - const auto vout_14p2_1 = wrapper::vcombine( - vout_14p2_10, - vout_14p2_11); - - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<2>(vout_14p2_0), - wrapper::vqrshrn<2>(vout_14p2_1)); - wrapper::vstore(out_ptr + x, vout_8p0); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. 
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0); + const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0); + + const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0); + const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. + const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01); + + const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } - //Process the left-over elements. - for(; x < window_end_x; ++x) - { + //Process the left-over elements. 
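Note: on the "widen ... to signed 16-bit regardless of the input signedness" comments in both branches: this is safe because every uint8_t and int8_t value is representable in int16_t, so QASYMM8 and QASYMM8_SIGNED inputs can share the signed offset-subtraction code. A trivial compile-time check of that assumption:

    #include <cstdint>

    static_assert(UINT8_MAX <= INT16_MAX, "uint8_t values fit in int16_t");
    static_assert(INT8_MIN >= INT16_MIN && INT8_MAX <= INT16_MAX, "int8_t values fit in int16_t");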
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t( - in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18))); + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>( + wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * + (int32_t(in1_ptr[x]) - in1_offset_16p0)) + + out_offset_14p18))); #else //__aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset))); + out_ptr[x] = utility::clamp(support::cpp11::lround( + multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + + float(out_offset))); #endif //__aarch64__ - } - }, - in0_it, in1_it, out_it); + } + }, + in0_it, in1_it, out_it); } } -void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +void mul_saturate_QSYMM16_QSYMM16_QSYMM16( + const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) { const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); @@ -580,66 +597,61 @@ void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *sr const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const qsymm16x8x2_t input1_q = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const qsymm16x8x2_t input1_q = {{ vld1q_s16(input1_ptr + x), vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { + }}; + const qsymm16x8x2_t input2_q = {{ vld1q_s16(input2_ptr + x), vld1q_s16(input2_ptr + x + 8), - } - }; + }}; - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); - 
vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; - float tmp_in2 = static_cast(*(input2_ptr + x)) * input2_qua_info.scale; - float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst, lrintf() has same rounding mode as vcombine_s16 - int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); - qsymm16_t tmp_qua = static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); + const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; + float tmp_in2 = static_cast(*(input2_ptr + x)) * input2_qua_info.scale; + float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst, lrintf() has same rounding mode as vcombine_s16 + int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); + qsymm16_t tmp_qua = + static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); } void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) @@ -665,74 +677,60 @@ void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor * const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const qsymm16x8x2_t input1_q = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const qsymm16x8x2_t input1_q = {{ vld1q_s16(input1_ptr + x), vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { + }}; + const qsymm16x8x2_t input2_q = {{ vld1q_s16(input2_ptr + x), vld1q_s16(input2_ptr + x + 8), - } - }; + }}; - const int32x4x4_t in1_s32 = - { - { + const int32x4x4_t in1_s32 = {{ vmovl_s16(vget_low_s16(input1_q.val[0])), vmovl_s16(vget_high_s16(input1_q.val[0])), vmovl_s16(vget_low_s16(input1_q.val[1])), vmovl_s16(vget_high_s16(input1_q.val[1])), - } - }; - const int32x4x4_t in2_s32 = - { - { + }}; + const int32x4x4_t in2_s32 = {{ vmovl_s16(vget_low_s16(input2_q.val[0])), vmovl_s16(vget_high_s16(input2_q.val[0])), vmovl_s16(vget_low_s16(input2_q.val[1])), vmovl_s16(vget_high_s16(input2_q.val[1])), - } - }; + }}; - const int32x4x4_t result = - { - { + const int32x4x4_t result = {{ vmulq_s32(in1_s32.val[0], 
in2_s32.val[0]), vmulq_s32(in1_s32.val[1], in2_s32.val[1]), vmulq_s32(in1_s32.val[2], in2_s32.val[2]), vmulq_s32(in1_s32.val[3], in2_s32.val[3]), - } - }; + }}; - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - vst1q_s32(output_ptr + x + 8, result.val[2]); - vst1q_s32(output_ptr + x + 12, result.val[3]); - } + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + vst1q_s32(output_ptr + x + 8, result.val[2]); + vst1q_s32(output_ptr + x + 12, result.val[3]); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - *(output_ptr + x) = tmp; - } - }, - input1, input2, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + *(output_ptr + x) = tmp; + } + }, + input1, input2, dst); } template @@ -757,79 +755,80 @@ void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); - const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); + const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); - uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); - const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); - uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); - const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); + uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); + const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); + uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); + const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); - tmp1_high = vmulq_u16(tmp1_high, tmp2_high); - tmp1_low = vmulq_u16(tmp1_low, tmp2_low); + tmp1_high = vmulq_u16(tmp1_high, tmp2_high); + tmp1_low = vmulq_u16(tmp1_low, tmp2_low); - if(is_scale255) - { - tmp1_high = scale255_U16_U16(tmp1_high); - tmp1_low = scale255_U16_U16(tmp1_low); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); + if (is_scale255) + { + tmp1_high = scale255_U16_U16(tmp1_high); + tmp1_low = scale255_U16_U16(tmp1_low); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); - if(is_sat) + if (is_sat) + { + tmp1_high = vqshlq_u16(tmp1_high, vn); + tmp1_low = vqshlq_u16(tmp1_low, vn); + } + else + { + tmp1_high = vshlq_u16(tmp1_high, vn); + tmp1_low = vshlq_u16(tmp1_low, vn); + } + } + if (is_sat) { - tmp1_high = vqshlq_u16(tmp1_high, vn); - tmp1_low = vqshlq_u16(tmp1_low, vn); + vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); } else { - tmp1_high = vshlq_u16(tmp1_high, vn); - tmp1_low = vshlq_u16(tmp1_low, vn); + vst1q_u8(output_ptr + x, 
vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); } } - if(is_sat) - { - vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); - } - else - { - vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - if(is_sat && tmp > 255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = 255; + uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if (is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + tmp = static_cast(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + if (is_sat && tmp > 255) + { + tmp = 255; + } + *(output_ptr + x) = static_cast(tmp); } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -843,7 +842,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & tmp1_high = vmulq_s32(tmp1_high, tmp2_high); tmp1_low = vmulq_s32(tmp1_low, tmp2_low); - if(is_scale255) + if (is_scale255) { tmp1_high = scale255_S32_S32(tmp1_high); tmp1_low = scale255_S32_S32(tmp1_low); @@ -863,7 +862,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); - if(is_sat) + if (is_sat) { tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); @@ -875,7 +874,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & } } - if(is_sat) + if (is_sat) { return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); } @@ -888,15 +887,10 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & template inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) { - const int16x8x2_t result = - { - { - // First 8 elements - mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), - // Second 8 elements - mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n) - } - }; + const int16x8x2_t result = {{// First 8 elements + mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), + // Second 8 elements + mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n)}}; return result; } @@ -923,67 +917,62 @@ void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - 
vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const int16x8x2_t ta2 = {{ + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + }}; + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } - if(is_scale255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = static_cast(tmp) * scale255_constant; + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) + if (is_scale255) { - tmp >>= n; + float tmp_f = static_cast(tmp) * scale255_constant; + + tmp = static_cast(tmp_f + 0.5f); } else { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast(tmp); } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -1012,7 +1001,7 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t & const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); - if(is_sat) + if (is_sat) { tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); @@ -1029,15 +1018,10 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t & template inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) { - const int32x4x2_t result = - { - { - // First 4 elements - mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), - // Second 4 elements - mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n) - } - }; + const int32x4x2_t result = {{// First 4 elements + mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), + // Second 4 elements + mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n)}}; return result; } @@ -1058,7 +1042,7 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -1074,60 +1058,56 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); + const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t broadcast_v = - { - { - broadcast_value_vec, - broadcast_value_vec, - } - }; - const int32x4x2_t non_broadcast_v = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x2_t broadcast_v = {{ + broadcast_value_vec, + broadcast_value_vec, + }}; + const int32x4x2_t non_broadcast_v = {{ vld1q_s32(non_broadcast_input_ptr + x), vld1q_s32(non_broadcast_input_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); } - if(is_sat) + + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = utility::clamp(tmp); + int64_t tmp = + static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); + + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + if (is_sat) + { + tmp = utility::clamp(tmp); + } + *(output_ptr + x) = static_cast(tmp); } - *(output_ptr + x) = static_cast(tmp); - } - }, - broadcast_input, non_broadcast_input, dst); + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1140,58 +1120,53 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int32x4x2_t ta1 = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) + { + const int32x4x2_t ta1 = {{ + vld1q_s32(input1_ptr + x), + vld1q_s32(input1_ptr + x + 4), + }}; + const int32x4x2_t ta2 = {{ + vld1q_s32(input2_ptr + x), + vld1q_s32(input2_ptr + x + 4), + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { + int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if (tmp >= 0) { - vld1q_s32(input1_ptr + x), - vld1q_s32(input1_ptr + x + 4), + tmp >>= n; } - }; - const int32x4x2_t ta2 = - { + else { - vld1q_s32(input2_ptr + x), - vld1q_s32(input2_ptr + x + 4), + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); + if (is_sat) + { + tmp = utility::clamp(tmp); + } + *(output_ptr + x) = static_cast(tmp); } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } } @@ -1212,7 +1187,7 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -1228,32 +1203,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1266,32 +1242,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - wrapper::vstore(output_ptr + x, res); - } + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { 
+ const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); } } @@ -1312,7 +1289,7 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -1328,48 +1305,49 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); - float32x4_t b = vdupq_n_f32(broadcast_value); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); + float32x4_t b = vdupq_n_f32(broadcast_value); - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - float32x4_t res = wrapper::vmul(tmp0, b); - b = wrapper::vmul(b, mask); + float32x4_t res = wrapper::vmul(tmp0, b); + b = wrapper::vmul(b, mask); - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); - const auto 
non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); - auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); - auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); + const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); + auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); + auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1382,51 +1360,52 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); - float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); + float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - float32x4_t res = wrapper::vmul(tmp0, b); + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - b = wrapper::vrev64(b); - b = wrapper::vmul(b, mask); + float32x4_t res = wrapper::vmul(tmp0, b); - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } + b = wrapper::vrev64(b); + b = wrapper::vmul(b, mask); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto a0 = *(input1_ptr + 2 * x); - const auto a1 = *(input1_ptr + 2 * x + 1); - const auto b0 = *(input2_ptr + 2 * x); - const auto b1 = *(input2_ptr + 2 * x + 1); - auto res1 = a0 * b0 - a1 * b1; - auto res2 = a0 * b1 
+ a1 * b0; - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - input1, input2, dst); + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto a0 = *(input1_ptr + 2 * x); + const auto a1 = *(input1_ptr + 2 * x + 1); + const auto b0 = *(input2_ptr + 2 * x); + const auto b1 = *(input2_ptr + 2 * x + 1); + auto res1 = a0 * b0 - a1 * b1; + auto res2 = a0 * b1 + a1 * b0; + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + input1, input2, dst); } } @@ -1444,7 +1423,7 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -1457,48 +1436,40 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = + win, + [&](const Coordinates &) { - { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = {{ vdupq_n_f16(broadcast_value), vdupq_n_f16(broadcast_value), - } - }; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = + }}; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t non_broadcast_v = {{ vld1q_f16(non_broadcast_input_ptr + x), vld1q_f16(non_broadcast_input_ptr + x + 8), - } - }; - const float16x8x2_t result = + }}; + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); + const auto non_broadcast_v = 
*(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1508,49 +1479,41 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator input2(src2, input2_win); Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - } - }; - const float16x8x2_t ta2 = - { - { - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - } - }; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = + const float16x8x2_t ta1 = {{ + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + }}; + const float16x8x2_t ta2 = {{ + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + }}; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); } } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ @@ -1577,81 +1540,82 @@ void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); - const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - - uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); - uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); - tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); - tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = 
reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); - if(is_scale255) - { - tmp_low = scale255_U16_U16(tmp_low); - tmp_high = scale255_U16_U16(tmp_high); - } - else + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const int16x8_t vn = vdupq_n_s16(-n); + const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); + const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - if(is_sat) + uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); + uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); + tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); + tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + + if (is_scale255) { - tmp_low = vqshlq_u16(tmp_low, vn); - tmp_high = vqshlq_u16(tmp_high, vn); + tmp_low = scale255_U16_U16(tmp_low); + tmp_high = scale255_U16_U16(tmp_high); } else { - tmp_low = vshlq_u16(tmp_low, vn); - tmp_high = vshlq_u16(tmp_high, vn); + const int16x8_t vn = vdupq_n_s16(-n); + + if (is_sat) + { + tmp_low = vqshlq_u16(tmp_low, vn); + tmp_high = vqshlq_u16(tmp_high, vn); + } + else + { + tmp_low = vshlq_u16(tmp_low, vn); + tmp_high = vshlq_u16(tmp_high, vn); + } } - } - if(is_sat) - { - static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); + if (is_sat) + { + static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); - tmp_low = vminq_u16(tmp_low, max); - tmp_high = vminq_u16(tmp_high, max); + tmp_low = vminq_u16(tmp_low, max); + tmp_high = vminq_u16(tmp_high, max); + } + + vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); + vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); } - vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); - vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + if (is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + tmp = static_cast(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp; + } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? 
SHRT_MAX : tmp; + *(output_ptr + x) = static_cast(tmp); } - - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -1676,75 +1640,65 @@ void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons const auto window_end_x = static_cast(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const uint8x8x2_t ta2u = + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const uint8x8x2_t ta2u = {{ vld1_u8(input2_ptr + x), vld1_u8(input2_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) - } - }; - - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + }}; + const int16x8x2_t ta2 = { + {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}}; - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } - if(is_scale255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = static_cast(tmp) * scale255_constant; + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) + if (is_scale255) { - tmp >>= n; + float tmp_f = static_cast(tmp) * scale255_constant; + + tmp = static_cast(tmp_f + 0.5f); } else { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); } + *(output_ptr + x) = static_cast(tmp); } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template @@ -1755,7 +1709,12 @@ void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons } } // namespace -void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +void CpuMulKernel::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); @@ -1775,7 +1734,7 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * bool is_scale_255 = false; // Check and validate scaling factor - if(std::abs(scale - scale255_constant) < 0.00001f) + if (std::abs(scale - scale255_constant) < 0.00001f) { is_scale_255 = true; } @@ -1795,12 +1754,12 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * const DataType dt_output = dst->data_type(); const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); - switch(dt_input1) + switch (dt_input1) { case DataType::QASYMM8: - if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) + if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) { - if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) { _func_quantized = &mul_q8_neon_fixedpoint; } @@ -1811,9 +1770,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::QASYMM8_SIGNED: - if(dt_input2 == DataType::QASYMM8_SIGNED) + if (dt_input2 == DataType::QASYMM8_SIGNED) { - if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) { _func_quantized = &mul_q8_neon_fixedpoint; } @@ -1824,19 +1783,19 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::QSYMM16: - if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) + if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) { _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; } - else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) + else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) { _func_int = &mul_QSYMM16_QSYMM16_S32; } break; case DataType::S16: - if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; } @@ -1845,9 +1804,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; } } - if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; } @@ -1858,15 +1817,15 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::S32: - if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) + if (DataType::S32 == dt_input2 && DataType::S32 == dt_output) { _func_int = is_sat ? 
&mul_S32_S32_S32 : &mul_S32_S32_S32; } break; case DataType::U8: - if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) + if (DataType::U8 == dt_input2 && DataType::U8 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; } @@ -1875,9 +1834,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; } } - else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; } @@ -1886,9 +1845,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; } } - else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_S16_S16 : &mul_U8_S16_S16; } @@ -1922,20 +1881,20 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_func_float == &mul_F32_F32_F32) + if (this->_func_float == &mul_F32_F32_F32) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } else { - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. @@ -1945,7 +1904,7 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -1958,10 +1917,10 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return std::max(static_cast(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. 
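The get_mws() hunks here are indentation-only; the heuristic they reformat picks a tuned minimum workload size for known cores (N1, V1), falls back to a coarser granule when a reinterpreted-1D tensor is split along X, and otherwise balances the work across threads. A minimal standalone sketch of that shape follows; CpuModel, fp32_mul_mws() and every constant in it are illustrative assumptions for this sketch, not names or values taken from the library.

// Illustrative sketch only: per-CPU-model minimum-workload-size (MWS) selection.
#include <algorithm>
#include <cstddef>

enum class CpuModel { GENERIC, N1, V1 };

std::size_t fp32_mul_mws(CpuModel model, bool split_along_x, bool is_1d_window,
                         std::size_t total_elems, std::size_t threads)
{
    constexpr std::size_t default_mws    = 1;    // generic fallback (placeholder)
    constexpr std::size_t tuned_mws_n1   = 1024; // placeholder, not the library's tuned value
    constexpr std::size_t tuned_mws_v1   = 1024; // placeholder, not the library's tuned value
    constexpr std::size_t min_1d_granule = 4096; // "don't split too small" guard (placeholder)

    std::size_t mws = default_mws;
    if (model == CpuModel::N1)
    {
        mws = tuned_mws_n1;
    }
    else if (model == CpuModel::V1)
    {
        mws = tuned_mws_v1;
    }
    else if (split_along_x)
    {
        mws = min_1d_granule; // avoid splitting a reinterpreted-1D tensor too finely
    }

    if (is_1d_window || threads == 0)
    {
        return mws; // 1D (or degenerate) case: hand back the granule directly
    }
    // Otherwise balance across threads but never drop below one element.
    return std::max<std::size_t>(1, std::min(mws, total_elems / threads));
}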
@@ -1970,8 +1929,12 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return default_mws; } -Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) +Status CpuMulKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); @@ -1989,11 +1952,11 @@ void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const Thre auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_func_quantized != nullptr) + if (_func_quantized != nullptr) { (*_func_quantized)(src1, src2, dst, window, _scale); } - else if(_func_int != nullptr) + else if (_func_int != nullptr) { (*_func_int)(src1, src2, dst, window, _scale_exponent); } @@ -2021,10 +1984,11 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); } return Status{}; diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h index 9e4a37110b..7eaf287507 100644 --- a/src/cpu/kernels/CpuMulKernel.h +++ b/src/cpu/kernels/CpuMulKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_MUL_KERNEL_H #include "arm_compute/core/Rounding.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -68,17 +69,27 @@ public: * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype * @param[in] rounding_policy Rounding policy. 
*/ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); // Inherited methods overridden - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Return minimum workload size of the relevant kernel @@ -108,7 +119,8 @@ private: * @param[in] window Region on which to execute the kernel * @param[in] scale Integer scale factor. */ - using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); + using MulFunctionInt = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); /** Common signature for all the specialised multiplication functions with float scaling factor * * @param[in] src1 Src1 tensor object. @@ -117,7 +129,8 @@ private: * @param[in] window Region on which to execute the kernel * @param[in] scale Float scale factor. */ - using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + using MulFunctionFloat = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor * * @param[in] src1 Src1 tensor object. @@ -127,14 +140,15 @@ private: * @param[in] scale Float scale factor. * */ - using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + using MulFunctionQuantized = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - MulFunctionFloat *_func_float{ nullptr }; - MulFunctionInt *_func_int{ nullptr }; - MulFunctionQuantized *_func_quantized{ nullptr }; - float _scale{ 0 }; - int _scale_exponent{ 0 }; - size_t _split_dimension{ Window::DimY }; + MulFunctionFloat *_func_float{nullptr}; + MulFunctionInt *_func_int{nullptr}; + MulFunctionQuantized *_func_quantized{nullptr}; + float _scale{0}; + int _scale_exponent{0}; + size_t _split_dimension{Window::DimY}; }; /** Interface for the complex pixelwise multiplication kernel. 
*/ @@ -159,7 +173,7 @@ public: static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp index d65e011032..b444a25ff7 100644 --- a/src/cpu/kernels/CpuPermuteKernel.cpp +++ b/src/cpu/kernels/CpuPermuteKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,56 +49,31 @@ namespace { inline bool is_permutation_supported(const PermutationVector &v) { - static const std::array permutations2 = - { - { - PermutationVector(0U, 1U), - PermutationVector(1U, 0U), - } - }; - static const std::array permutations3 = - { - { - PermutationVector(2U, 0U, 1U), - PermutationVector(1U, 2U, 0U), - PermutationVector(0U, 1U, 2U), - PermutationVector(0U, 2U, 1U), - PermutationVector(1U, 0U, 2U), - PermutationVector(2U, 1U, 0U), - } - }; - static const std::array permutations4 = - { - { - PermutationVector(0U, 1U, 2U, 3U), - PermutationVector(1U, 0U, 2U, 3U), - PermutationVector(2U, 0U, 1U, 3U), - PermutationVector(0U, 2U, 1U, 3U), - PermutationVector(1U, 2U, 0U, 3U), - PermutationVector(2U, 1U, 0U, 3U), - PermutationVector(2U, 1U, 3U, 0U), - PermutationVector(1U, 2U, 3U, 0U), - PermutationVector(3U, 2U, 1U, 0U), - PermutationVector(2U, 3U, 1U, 0U), - PermutationVector(1U, 3U, 2U, 0U), - PermutationVector(3U, 1U, 2U, 0U), - PermutationVector(3U, 0U, 2U, 1U), - PermutationVector(0U, 3U, 2U, 1U), - PermutationVector(2U, 3U, 0U, 1U), - PermutationVector(3U, 2U, 0U, 1U), - PermutationVector(0U, 2U, 3U, 1U), - PermutationVector(2U, 0U, 3U, 1U), - PermutationVector(1U, 0U, 3U, 2U), - PermutationVector(0U, 1U, 3U, 2U), - PermutationVector(3U, 1U, 0U, 2U), - PermutationVector(1U, 3U, 0U, 2U), - PermutationVector(0U, 3U, 1U, 2U), - PermutationVector(3U, 0U, 1U, 2U) - } - }; + static const std::array permutations2 = {{ + PermutationVector(0U, 1U), + PermutationVector(1U, 0U), + }}; + static const std::array permutations3 = {{ + PermutationVector(2U, 0U, 1U), + PermutationVector(1U, 2U, 0U), + PermutationVector(0U, 1U, 2U), + PermutationVector(0U, 2U, 1U), + PermutationVector(1U, 0U, 2U), + PermutationVector(2U, 1U, 0U), + }}; + static const std::array permutations4 = { + {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U), + PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U), + PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U), + PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U), + PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U), + PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), PermutationVector(2U, 0U, 3U, 1U), + PermutationVector(1U, 0U, 3U, 2U), PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U), + 
PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}}; - return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) - || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); + return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || + (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) || + (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); } Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) @@ -108,7 +84,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); // Validate configured destination - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -128,18 +104,22 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others // we have to fall back to C++ - if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })) + if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) || + (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})) { - window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); - window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); - window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); + window_src.set(Window::DimX, + Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); + window_src.set(Window::DimY, + Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); + window_src.set(Window::DimZ, + Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start())); } // Destination window Window window_dst(window); const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d) + for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d) { window_dst.set(d, zero_window); } @@ -157,7 +137,7 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c int n_channels = 0; int n_batches = 0; - switch(src_layout) + switch (src_layout) { case DataLayout::NCHW: { @@ -189,38 +169,42 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c } // CHW -> HWC - if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) + if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) { const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T); const int out_col_stride = dst->info()->strides_in_bytes().y() / 
sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; - reorder::nchw_to_nhwc(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_channels, n_rows, n_cols, - in_batch_stride, in_channel_stride, in_row_stride, - out_batch_stride, out_row_stride, out_col_stride); - }, - src_it, dst_it); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; + reorder::nchw_to_nhwc(reinterpret_cast(src_it.ptr()), + reinterpret_cast(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols, + in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride, + out_row_stride, out_col_stride); + }, + src_it, dst_it); } // HWC -> CHW - else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }) + else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}) { const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; - reorder::nhwc_to_nchw(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_rows, n_cols, n_channels, - in_batch_stride, in_row_stride, in_col_stride, - out_batch_stride, out_channel_stride, out_row_stride); - }, - src_it, dst_it); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; + reorder::nhwc_to_nchw(reinterpret_cast(src_it.ptr()), + reinterpret_cast(dst_it.ptr()) + idx, n_batches, n_rows, n_cols, n_channels, + in_batch_stride, in_row_stride, in_col_stride, out_batch_stride, + out_channel_stride, out_row_stride); + }, + src_it, dst_it); } else { @@ -230,12 +214,15 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c Strides perm_strides = strides; permute_strides(perm_strides, perm); const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? 
perm_strides[3] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; - *(reinterpret_cast(dst_it.ptr() + idx)) = *(reinterpret_cast(src_it.ptr())); - }, - src_it, dst_it); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = + id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; + *(reinterpret_cast(dst_it.ptr() + idx)) = *(reinterpret_cast(src_it.ptr())); + }, + src_it, dst_it); } } } // namespace @@ -275,7 +262,7 @@ void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(src->info()->element_size()) + switch (src->info()->element_size()) { case 1: run_permute(window, src, dst, _perm); diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h index 9e1b93318e..0cb2faf223 100644 --- a/src/cpu/kernels/CpuPermuteKernel.h +++ b/src/cpu/kernels/CpuPermuteKernel.h @@ -57,7 +57,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp index d72a41cbbe..9308d860d1 100644 --- a/src/cpu/kernels/CpuPool2dKernel.cpp +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -25,15 +25,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/pool2d/neon/list.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + #include namespace arm_compute @@ -46,99 +48,111 @@ namespace { using namespace misc::shape_calculator; -static const std::vector available_kernels = -{ - { - "neon_qu8_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) - }, - { - "neon_qs8_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) - }, - { - "neon_f16_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) - }, - { - "neon_fp32_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) - }, +static const std::vector available_kernels = { + 
{"neon_qu8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)}, + {"neon_qs8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)}, + {"neon_f16_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)}, + {"neon_fp32_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)}, #if defined(ENABLE_NCHW_KERNELS) - { - "neon_qu8_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, - { - "neon_fp16_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == 
DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) - }, - { - "neon_fp32_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool7", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) - }, + {"neon_qu8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw)}, + {"neon_qu8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw)}, + {"neon_qu8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw)}, + {"neon_qs8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw)}, + {"neon_qs8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw)}, + {"neon_qs8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw)}, + {"neon_fp16_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)}, + {"neon_fp16_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == 
DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)}, + {"neon_fp16_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)}, + {"neon_fp32_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool7", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)}, + {"neon_fp32_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)}, #endif /* defined(ENABLE_NCHW_KERNELS) */ }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, - const ITensorInfo *indices, Size2D pool_size) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices, + Size2D pool_size) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); @@ -150,65 +164,78 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; PoolingType pool_type = pool_info.pool_type; const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) - && (is_pool_region_entirely_outside_input(pool_info)), - "Pooling region that is entirely outside input tensor is unsupported for non-float types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)), + "Pooling region that is entirely outside input tensor is unsupported for non-float types"); - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size.x(), pool_size.y(), pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + std::tie(output_width, output_height) = + scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(), + pool_size.y(), pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), + "Calculated output dimension size is invalid"); TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - if(indices) + if (indices) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); } - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (src->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && + (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() && + (src->data_layout() == DataLayout::NHWC), + "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - if(indices) + if (indices) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), "Pooling indices returning source tensor coordinates is only supported for pool size 2x2"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), "Pooling kernel indices only supported for NHWC"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), + "Pooling indices returning source tensor coordinates is only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), + "Pooling kernel indices only supported for NHWC"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); } } - const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa() }); + const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ + src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, - unsigned int &num_elems_processed_per_iteration, - int pool_size_x, int pool_size_y) +std::pair validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + ITensorInfo *indices, + const PoolingLayerInfo &pool_info, + unsigned int &num_elems_processed_per_iteration, + int pool_size_x, + int pool_size_y) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); - if(indices) + if (indices) { // Indices auto inizialitation if not yet initialized - auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, - pool_info))) - .set_data_type(DataType::U32) /* we store the offset to the element */); + auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))) + .set_data_type(DataType::U32) /* we store the offset to the element */); } const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; @@ -219,20 +246,20 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const bool is_square = pool_size_x == pool_size_y; - const unsigned int pooled_w = dst->dimension(idx_width); - const unsigned int pooled_h = dst->dimension(idx_height); + const bool is_square = pool_size_x == pool_size_y; + const unsigned int pooled_w = dst->dimension(idx_width); + const unsigned int pooled_h = dst->dimension(idx_height); //If it's not squared and optimized will be executed the MxN num_elems_processed_per_iteration = 1; - if(is_square) + if (is_square) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: - switch(pool_size_x) + switch (pool_size_x) { case 2: num_elems_processed_per_iteration = (pool_stride_x == 2) ? 
8 : 15; @@ -261,18 +288,22 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso bool window_changed = false; Window win{}; // Upper limit for the number of right/bottom border elements that are accessed - TensorShape dst_shape{ src->tensor_shape() }; + TensorShape dst_shape{src->tensor_shape()}; dst_shape.set(0, pooled_w); dst_shape.set(1, pooled_h); TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +void CpuPool2dKernel::configure(ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; @@ -284,14 +315,15 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Update pool size in case of global pooling - const Size2D pool_size( - is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); + const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); - const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, pool_size, CPUInfo::get().get_isa() }); + const auto *uk = CpuPool2dKernel::get_implementation( + PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, + pool_size, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); // Set instance variables @@ -302,7 +334,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin _run_method = uk->ukernel; _name = std::string("CpuPool2dKernel").append("/").append(uk->name); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { // Configure kernel window Window win = calculate_max_window(*dst, Steps()); @@ -311,14 +343,17 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin else { // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, - pool_size.x(), pool_size.y()); + auto win_config = validate_and_configure_window( + src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICpuKernel::configure(win_config.second); } } -Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CpuPool2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -336,9 +371,10 @@ Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), - (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, - pool_size_x, pool_size_y) - .first); + (indices) ? indices->clone().get() : nullptr, pool_info, + num_elems_processed_per_iteration, pool_size_x, + pool_size_y) + .first); return Status{}; } @@ -359,19 +395,20 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T const unsigned int pool_size = _pool_info.pool_size.width; Window window_src(window); - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { // Set step for src in x and y direction for the src unsigned int window_x_inc = 0; - switch(src->info()->data_type()) + switch (src->info()->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: { window_x_inc = pool_stride_x; - if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 + : _num_elems_processed_per_iteration; } break; } @@ -387,8 +424,10 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T ARM_COMPUTE_ERROR("Not supported"); } } - window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); - window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); + window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, + window.x().end() * pool_stride_x, window_x_inc)); + window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, + window.y().end() * pool_stride_y, pool_stride_y)); } else { diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h index c952ea839d..859de8cc5f 100644 --- a/src/cpu/kernels/CpuPool2dKernel.h +++ b/src/cpu/kernels/CpuPool2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL2D_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -38,7 +39,8 @@ namespace kernels class CpuPool2dKernel : public ICpuKernel { private: - using PoolingKernelPtr = std::add_pointer::type; + using PoolingKernelPtr = std::add_pointer::type; public: CpuPool2dKernel() = default; @@ -52,17 +54,21 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
*/ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuPool2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct PoolingKernel @@ -76,11 +82,11 @@ public: private: PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 0 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{0}; Size2D _pool_size{}; int _pool_stride_x{}; - PoolingKernelPtr _run_method{ nullptr }; + PoolingKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp index 4504f3f7c9..8b484d4e0b 100644 --- a/src/cpu/kernels/CpuPool3dKernel.cpp +++ b/src/cpu/kernels/CpuPool3dKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/pool3d/list.h" @@ -41,39 +42,28 @@ namespace { using namespace misc::shape_calculator; -static const std::vector available_kernels = -{ - { - "neon_qu8_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d) - }, - { - "neon_qs8_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d) - }, - { - "neon_fp16_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d) - }, - { - "neon_fp32_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d) - } -}; +static const std::vector available_kernels = { + {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)}, + {"neon_qs8_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)}, + {"neon_fp16_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)}, + {"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData 
&data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding - && (pool_info.pool_type == PoolingType::AVG)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && + (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)), "Exclude padding is unsupported for non-float types for Avg op"); const auto data_layout = src->data_layout(); @@ -97,21 +87,26 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; int output_depth = 0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); - std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth], - pool_size_x, pool_size_y, pool_size_z, pool_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid"); + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC)); + TensorInfo out_info( + TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); } - const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -136,12 +131,12 @@ void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const // Update pool size in case of global pooling const bool is_global_pooling = pool_info.is_global_pooling; - const Size3D pool_size( - is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height, - is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth); + const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height, + is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth); - const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); // Set instance variables @@ -188,4 +183,4 @@ const std::vector &CpuPool3dKernel::get_availa } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h index 437f2af7e4..bd1ff61046 100644 --- a/src/cpu/kernels/CpuPool3dKernel.h +++ b/src/cpu/kernels/CpuPool3dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL3D_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,7 +40,8 @@ class CpuPool3dKernel : public ICpuKernel { private: /* Template function for Pooling 3D NDHWC */ - using Pooling3dKernelPtr = std::add_pointer::type; + using Pooling3dKernelPtr = + std::add_pointer::type; public: CpuPool3dKernel() = default; @@ -68,7 +70,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct Pooling3dKernel @@ -82,11 +84,11 @@ public: private: Pooling3dLayerInfo _pool_info{}; - Pooling3dKernelPtr _run_method{ nullptr }; + Pooling3dKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */ \ No newline at end of file +#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp index 9700c62318..5dde680837 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.cpp +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -28,13 +28,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/CPP/Validate.h" #include #include @@ -53,9 +53,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); 
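The CpuQuantizeKernel hunks around this point re-wrap validation macros and the vectorised loops; what those loops compute is the usual affine quantization q = clamp(round(x / scale) + offset, qmin, qmax). A scalar stand-in for the QASYMM8 case is sketched below; the helper name and the round-to-nearest choice are assumptions of this sketch, while the kernel's actual rounding follows its RoundingPolicy and the NEON vquantize_* helpers.

// Scalar illustration of affine QASYMM8 quantization (assumed helper, not library code).
#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t quantize_qasymm8_scalar(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255)); // saturate to the U8 range
}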
ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); return Status{}; @@ -71,19 +73,15 @@ inline float32x4x4_t load_value(const T *input_ptr) template <> inline float32x4x4_t load_value(const float *input_ptr) { - return { wrapper::vloadq(input_ptr), - wrapper::vloadq(input_ptr + 4), - wrapper::vloadq(input_ptr + 8), - wrapper::vloadq(input_ptr + 12) }; + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), + wrapper::vloadq(input_ptr + 12)}; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> inline float32x4x4_t load_value(const float16_t *input_ptr) { - return { vcvt_f32_f16(wrapper::vload(input_ptr)), - vcvt_f32_f16(wrapper::vload(input_ptr + 4)), - vcvt_f32_f16(wrapper::vload(input_ptr + 8)), - vcvt_f32_f16(wrapper::vload(input_ptr + 12)) }; + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -113,26 +111,25 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - static const std::map quant_map = - { - { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, + static const std::map quant_map = { + {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16}, - { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, + {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16}, - { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8 }, + {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8}, - { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, + {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, + {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8}, + {"op_F16_QASYMM16", 
&CpuQuantizeKernel::run_quantize_qasymm16}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ }; @@ -142,7 +139,7 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) auto it = quant_map.find(function_to_call); - if(it == quant_map.end()) + if (it == quant_map.end()) { ARM_COMPUTE_ERROR("Unsupported combination of input and output data types"); } @@ -167,7 +164,7 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -177,22 +174,24 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); - } - }, - input, output); + auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); + } + }, + input, output); } template @@ -203,7 +202,7 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -219,23 +218,25 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); + auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + 
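CpuQuantizeKernel::configure() keys a map of member-function pointers on a string built from the source and destination data types, as the quant_map above shows. The shape of that dispatch pattern, reduced to a self-contained sketch (class, method and key names here are hypothetical):

#include <map>
#include <stdexcept>
#include <string>

class Dispatcher
{
    using Fn = void (Dispatcher::*)();
    void run_u8() {}
    void run_s8() {}

public:
    void configure(const std::string &src_type, const std::string &dst_type)
    {
        // Key format mirrors "op_<SRC>_<DST>" used above.
        static const std::map<std::string, Fn> table = {
            {"op_QASYMM8_QASYMM8", &Dispatcher::run_u8},
            {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &Dispatcher::run_s8},
        };
        const auto it = table.find("op_" + src_type + "_" + dst_type);
        if (it == table.end())
        {
            throw std::runtime_error("Unsupported combination of input and output data types");
        }
        _func = it->second;
    }
    void run()
    {
        (this->*_func)();
    }

private:
    Fn _func{nullptr};
};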
wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); } template @@ -246,7 +247,7 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -262,25 +263,27 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); - vst1q_u16(&output_ptr[x], tmp.val[0]); - vst1q_u16(&output_ptr[x + 8], tmp.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); + auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); + vst1q_u16(&output_ptr[x], tmp.val[0]); + vst1q_u16(&output_ptr[x + 8], tmp.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); } void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h index 2bc8105a11..d6714136da 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.h +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -59,7 +59,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -67,7 +67,9 @@ private: * * @param[in] window Region on which to execute the kernel. */ - using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window); /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. * * @param[in] window Region on which to execute the kernel. 
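Each run_quantize_* body follows the same two-phase structure: a vectorised main loop advancing by window_step, then a scalar pass over the left-over elements. A minimal sketch of that structure on a flat buffer (a saturating add stands in for the real quantisation math):

#include <arm_neon.h>
#include <cstdint>

// Process 16 bytes per iteration, then finish the remainder one element at a time.
void offset_u8(const uint8_t *in, uint8_t *out, int len, uint8_t offset)
{
    const uint8x16_t voffset = vdupq_n_u8(offset);
    int x = 0;
    for (; x <= len - 16; x += 16)
    {
        vst1q_u8(out + x, vqaddq_u8(vld1q_u8(in + x), voffset)); // saturating vector add
    }
    for (; x < len; ++x) // compute left-over elements
    {
        const int v = in[x] + offset;
        out[x] = static_cast<uint8_t>(v > 255 ? 255 : v);
    }
}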
@@ -84,7 +86,7 @@ private: template void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window); - QuantizeFunctionExecutorPtr _func{ nullptr }; + QuantizeFunctionExecutorPtr _func{nullptr}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp index a9672a8c5e..241e58fbce 100644 --- a/src/cpu/kernels/CpuReshapeKernel.cpp +++ b/src/cpu/kernels/CpuReshapeKernel.cpp @@ -29,9 +29,11 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/INEKernel.h" + #include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" + #include /** [NEReshapeLayerKernel Kernel] **/ @@ -49,7 +51,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->tensor_shape().total_size() != 0) + if (dst->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -59,29 +61,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) return Status{}; } - template void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst) { const TensorShape &src_shape = src->info()->tensor_shape(); const TensorShape &dst_shape = dst->info()->tensor_shape(); - Iterator dst_it(dst, window); + Iterator dst_it(dst, window); - execute_window_loop(window, [&](const Coordinates & dst_coord) - { - Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); - const auto output_ptr = dst->ptr_to_element(dst_coord); - const auto input_ptr = src->ptr_to_element(src_coord); + execute_window_loop( + window, + [&](const Coordinates &dst_coord) + { + Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + const auto output_ptr = dst->ptr_to_element(dst_coord); + const auto input_ptr = src->ptr_to_element(src_coord); - *reinterpret_cast(output_ptr) = *reinterpret_cast(input_ptr); - }, - dst_it); + *reinterpret_cast(output_ptr) = *reinterpret_cast(input_ptr); + }, + dst_it); } -void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst ) +void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst) { - switch(src->info()->data_type()) + switch (src->info()->data_type()) { case DataType::U8: case DataType::S8: @@ -131,22 +134,24 @@ void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *d win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator dst_it(dst, win); - execute_window_loop(win, [&]( Coordinates & id) - { - dst_coord = id; - - for(int x = window_start_x; x < window_end_x; x += src_row_size) + execute_window_loop( + win, + [&](Coordinates &id) { - src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); - output_ptr = dst->ptr_to_element(dst_coord); - input_ptr = src->ptr_to_element(src_coord); + dst_coord = id; - std::memcpy(output_ptr, input_ptr, row_size_in_bytes); + for (int x = window_start_x; x < window_end_x; x += src_row_size) + { + src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + output_ptr = dst->ptr_to_element(dst_coord); + 
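The element-wise path in CpuReshapeKernel below relies on index2coords/coords2index to remap each destination coordinate onto the source tensor. A simplified three-dimensional version of that mapping, with dimension 0 as the fastest-moving index (the library's own helpers operate on arbitrary-rank TensorShape; this is only a sketch):

#include <array>
#include <cstddef>

using Coord3 = std::array<std::size_t, 3>;

std::size_t coords2index(const Coord3 &shape, const Coord3 &c)
{
    return c[0] + shape[0] * (c[1] + shape[1] * c[2]);
}

Coord3 index2coords(const Coord3 &shape, std::size_t idx)
{
    Coord3 c{};
    c[0] = idx % shape[0];
    idx /= shape[0];
    c[1] = idx % shape[1];
    idx /= shape[1];
    c[2] = idx;
    return c;
}

// The element-wise reshape then performs, for every destination coordinate d:
//   dst[d] = src[index2coords(src_shape, coords2index(dst_shape, d))]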
input_ptr = src->ptr_to_element(src_coord); - dst_coord.increment(Window::DimX, src_row_size); - } - }, - dst_it); + std::memcpy(output_ptr, input_ptr, row_size_in_bytes); + + dst_coord.increment(Window::DimX, src_row_size); + } + }, + dst_it); } void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst) @@ -213,8 +218,8 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors) const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - const ITensorInfo* src_info = src->info(); - const ITensorInfo* dst_info = dst->info(); + const ITensorInfo *src_info = src->info(); + const ITensorInfo *dst_info = dst->info(); // Calculate kernel window based on the padding info Window win; @@ -226,7 +231,7 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors) const auto src_row_size = static_cast(src_info->tensor_shape()[0]); const auto dst_row_size = static_cast(dst_info->tensor_shape()[0]); - if(!src_has_holes && !dst_has_holes) + if (!src_has_holes && !dst_has_holes) { std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info); /* diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h index eddbbf7135..ce566fd9e2 100644 --- a/src/cpu/kernels/CpuReshapeKernel.h +++ b/src/cpu/kernels/CpuReshapeKernel.h @@ -55,7 +55,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Prepare the reshape kernel for execution (Only executed once) by calculating max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes @@ -84,10 +84,9 @@ public: } private: - size_t _split_dimension{ Window::DimY }; - - std::function _reshape_tensor_fn{}; + size_t _split_dimension{Window::DimY}; + std::function _reshape_tensor_fn{}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp index 332304599f..702e0a8134 100644 --- a/src/cpu/kernels/CpuScaleKernel.cpp +++ b/src/cpu/kernels/CpuScaleKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/InterpolationPolicyUtils.h" #include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/ScaleHelpers.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,104 +45,74 @@ namespace kernels { namespace { -static const std::vector available_kernels = -{ - { - "sve_fp16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) - }, - { - "sve_fp32_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) - }, - { - "sve_qu8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) - }, - { - "sve_qs8_scale", - [](const 
ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) - }, - { - "sve_u8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) - }, - { - "sve_s16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) - }, - { - "neon_fp16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_fp32_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_qu8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) - }, - { - "neon_qs8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) - }, - { - "neon_u8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale) - }, - { - "neon_s8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale) - }, - { - "neon_s16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale) - }, +static const std::vector available_kernels = { + {"sve_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)}, + {"sve_fp32_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)}, + {"sve_qu8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)}, + {"sve_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)}, + {"sve_u8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)}, + {"sve_s16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && 
data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)}, + {"neon_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale)}, + {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale)}, + {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)}, + {"neon_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)}, + {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)}, + {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)}, + {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) { - const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy }); + const auto *uk = CpuScaleKernel::get_implementation( + ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); ARM_COMPUTE_UNUSED(info.constant_border_value); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); @@ -153,27 +124,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0); ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0); - ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && (data_layout != DataLayout::NHWC || info.interpolation_policy != InterpolationPolicy::BILINEAR - || info.border_mode != BorderMode::REPLICATE)); + ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && + (data_layout != DataLayout::NHWC || + info.interpolation_policy != InterpolationPolicy::BILINEAR || + info.border_mode != BorderMode::REPLICATE)); - if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr) + if (info.interpolation_policy == 
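CpuScaleKernel::get_implementation walks a table like the available_kernels list above and returns the first entry whose selector predicate accepts the runtime data type, ISA flags and interpolation policy. A stripped-down sketch of that registry pattern (the types, flags and kernel names here are invented for illustration):

#include <functional>
#include <string>
#include <vector>

struct SelectorData
{
    bool is_f32;
    bool has_sve;
};
using KernelFn = void (*)(const float *, float *, int);

struct KernelEntry
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    KernelFn                                  ukernel;
};

void sve_fp32_stub(const float *, float *, int) {}
void neon_fp32_stub(const float *, float *, int) {}

// More specialised kernels come first: the first matching predicate wins.
static const std::vector<KernelEntry> registry = {
    {"sve_fp32_scale", [](const SelectorData &d) { return d.is_f32 && d.has_sve; }, sve_fp32_stub},
    {"neon_fp32_scale", [](const SelectorData &d) { return d.is_f32; }, neon_fp32_stub},
};

const KernelEntry *get_implementation(const SelectorData &d)
{
    for (const auto &entry : registry)
    {
        if (entry.is_selected(d))
        {
            return &entry;
        }
    }
    return nullptr;
}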
InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); } - if(info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr) + if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); - if(dx != nullptr && dy != nullptr) + if (dx != nullptr && dy != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); } } - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && + !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - if(info.interpolation_policy == InterpolationPolicy::AREA) + if (info.interpolation_policy == InterpolationPolicy::AREA) { ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); @@ -183,24 +157,28 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I } } // namespace -void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, - ITensorInfo *dst, const ScaleKernelInfo &info) +void CpuScaleKernel::configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_UNUSED(dx, dy, offsets); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - dx, - dy, - offsets, - dst, - info)); - - const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info)); + + const auto *uk = CpuScaleKernel::get_implementation( + ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; - _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy)); + _name = std::string("CpuScaleKernel") + .append("/") + .append(uk->name) + .append("_") + .append(string_from_interpolation_policy(info.interpolation_policy)); // Get data layout and width/height indices _data_layout = info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : info.data_layout; @@ -212,19 +190,22 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co _constant_border_value = info.constant_border_value; _align_corners = info.align_corners; - if(info.sampling_policy == SamplingPolicy::CENTER) + if (info.sampling_policy == SamplingPolicy::CENTER) { _sampling_offset = 0.5f; } // Compute the ratio between source width/height and destination width/height - const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); + const auto wr = + scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy; + _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR + : _policy; - if(_border_mode == BorderMode::UNDEFINED) + if (_border_mode == BorderMode::UNDEFINED) { _border_mode = BorderMode::CONSTANT; _constant_border_value = PixelValue(); @@ -232,39 +213,38 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co #ifdef ENABLE_NCHW_KERNELS // Configure scale function to run - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { std::string function_to_call("scale_"); function_to_call += string_from_data_type(src->data_type()) + "_"; function_to_call += string_from_data_layout(_data_layout) + "_"; function_to_call += string_from_interpolation_policy(_policy); - static std::map map_function = - { - { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 }, + static std::map map_function = { + {"scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8}, - { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, + {"scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw}, + {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw}, - { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm }, - { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, + {"scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm}, + {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw}, - { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm }, - { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, + {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm}, + {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw}, - { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, + {"scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw}, + {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_F16_NCHW_NEAREST_NEIGHBOUR", 
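The wr and hr ratios computed above drive the AREA to NEAREST_NEIGHBOR fallback. A sketch of the ratio computation this code assumes, including the align_corners adjustment (not the library's exact helper, only its assumed behaviour):

#include <cstddef>

// With align_corners the end points of input and output map onto each other,
// so one sample is removed from both extents. Assumes dimensions > 1 when
// align_corners is set.
float calculate_resize_ratio_sketch(std::size_t in_dim, std::size_t out_dim, bool align_corners)
{
    const std::size_t offset = align_corners ? 1 : 0;
    return static_cast<float>(in_dim - offset) / static_cast<float>(out_dim - offset);
}
// wr <= 1 and hr <= 1 therefore means up-sampling, which is why AREA falls
// back to NEAREST_NEIGHBOR in the configure step above.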
&CpuScaleKernel::scale_nearest_nchw }, + {"scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw}, + {"scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, + {"scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw}, + {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw}, }; auto it = map_function.find(function_to_call); - if(it != map_function.end()) + if (it != map_function.end()) { _func = it->second; } @@ -278,13 +258,19 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co #ifdef ENABLE_NCHW_KERNELS template -void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy); const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -296,7 +282,7 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const Window win_off; win_off.set(Window::DimX, window[Window::DimX]); win_off.set(Window::DimY, window[Window::DimY]); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -305,24 +291,33 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const Iterator src_i(src, win_in); Iterator dst_i(dst, window); Iterator offsets_i(offsets, win_off); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets_i.ptr()); - const auto in_yi = static_cast(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor(( - id.y() + _sampling_offset) - * hr)); - const int32_t offset_row = in_yi * in_stride_x; - *reinterpret_cast(dst_i.ptr()) = *(reinterpret_cast(src_i.ptr()) + offsets_ptr[0] + offset_row); - }, - src_i, offsets_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offsets_ptr = reinterpret_cast(offsets_i.ptr()); + const auto in_yi = static_cast( + _align_corners ? 
utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) + : std::floor((id.y() + _sampling_offset) * hr)); + const int32_t offset_row = in_yi * in_stride_x; + *reinterpret_cast(dst_i.ptr()) = + *(reinterpret_cast(src_i.ptr()) + offsets_ptr[0] + offset_row); + }, + src_i, offsets_i, dst_i); } template -void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - Window win_off; + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + Window win_off; win_off.set(Window::DimX, window.x()); win_off.set(Window::DimY, window.y()); @@ -332,7 +327,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -347,7 +342,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const const int32_t in_dim_h = src->info()->dimension(1); const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; - if(_border_mode == BorderMode::CONSTANT) + if (_border_mode == BorderMode::CONSTANT) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC using ConstType = typename std::conditional::value, half, T>::type; @@ -355,52 +350,60 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const using ConstType = T; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h - && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h - && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) : - const_border_value; - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); + const auto index_w = *(reinterpret_cast(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) + : const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) + : const_border_value; + + *reinterpret_cast(dst_i.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_x = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_x1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_y = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_y1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); - const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); - const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); - const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); + else if (_border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); + const auto index_w = *(reinterpret_cast(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); + + auto clamped_x = utility::clamp(index_w, 0, in_dim_w - 1); + auto clamped_x1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); + auto clamped_y = utility::clamp(index_h, 0, in_dim_h - 1); + auto clamped_y1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); + const auto a01 = *(pixel_row_ptr + clamped_x1 + 
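Both border branches feed the same four neighbours into delta_bilinear; with REPLICATE the indices are clamped into range first, while CONSTANT substitutes the border value for out-of-range taps. The underlying arithmetic as a scalar sketch for a single-channel float image (illustrative only):

#include <algorithm>

float bilinear_replicate(const float *img, int w, int h, int x, int y, float dx, float dy)
{
    // REPLICATE border: clamp the 2x2 neighbourhood into the valid range.
    const int x0 = std::clamp(x, 0, w - 1);
    const int x1 = std::clamp(x + 1, 0, w - 1);
    const int y0 = std::clamp(y, 0, h - 1);
    const int y1 = std::clamp(y + 1, 0, h - 1);

    const float a00 = img[y0 * w + x0];
    const float a01 = img[y0 * w + x1];
    const float a10 = img[y1 * w + x0];
    const float a11 = img[y1 * w + x1];

    // Weighted average of the four neighbours, as in delta_bilinear.
    return a00 * (1.f - dx) * (1.f - dy) + a01 * dx * (1.f - dy) +
           a10 * (1.f - dx) * dy + a11 * dx * dy;
}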
clamped_y * in_stride_w); + const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); + const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); + + *reinterpret_cast(dst_i.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); } else { @@ -408,7 +411,12 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const } } -void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, offsets); using namespace scale_helpers; @@ -425,50 +433,60 @@ void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const Iterator src_i(src, win_in); Iterator dst_i(dst, window); - const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + const auto wr = + scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); const auto w = src->info()->dimension(0); const auto h = src->info()->dimension(1); const size_t in_stride = src->info()->strides_in_bytes()[1]; - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(src_i.ptr()); - - uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() 
+ 15, id.y()), tmp1, 7); - - vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); - }, - src_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_ptr = reinterpret_cast(src_i.ptr()); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); + + vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); + }, + src_i, dst_i); } template -void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { // Get data layout and width/height indices const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), + dst->info()->dimension(idx_height), _align_corners); Window win_off; win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); @@ -479,7 +497,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con win_in.set(idx_width, Window::Dimension(0, 0, 0)); win_in.set(idx_height, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 
0)); } @@ -495,7 +513,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - if(_border_mode == BorderMode::CONSTANT) + if (_border_mode == BorderMode::CONSTANT) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC using ConstType = typename std::conditional::value, half, T>::type; @@ -503,62 +521,74 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con using ConstType = T; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast( + offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = + *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = + *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) + ? 
(*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) + : const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); + else if (_border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast( + offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = + *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = + *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); + + auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr 
+ clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); } else { @@ -567,8 +597,12 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con } #endif // ENABLE_NCHW_KERNELS -Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) +Status CpuScaleKernel::validate(const ITensorInfo *input, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *output, + const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); return Status{}; @@ -588,13 +622,14 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1); const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2); - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { (this->*_func)(src, dst, dx, dy, offsets, window); } else { - _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window); + _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, + _align_corners, window); } } diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h index 8102142fc3..38142df021 100644 --- a/src/cpu/kernels/CpuScaleKernel.h +++ b/src/cpu/kernels/CpuScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_SCALEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,9 +40,19 @@ class CpuScaleKernel : public ICpuKernel { private: /** Scale function to use for the particular function to use */ - using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); - using ScaleKernelPtr = std::add_pointer::type; + using ScaleFunctionPtr = void (CpuScaleKernel::*)( + const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); + using ScaleKernelPtr = std::add_pointer::type; public: CpuScaleKernel() = default; @@ -59,7 +70,11 @@ public: * @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
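The QASYMM8 bilinear path above dequantises the four corner samples with the input's uniform scale and offset, interpolates in float, and requantises with the output's parameters. A scalar sketch of that round trip (rounding policy simplified to round-half-away-from-zero; struct and function names are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo
{
    float   scale;
    int32_t offset;
};

inline float dequantize(uint8_t q, UniformQInfo qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

inline uint8_t quantize(float v, UniformQInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
}

uint8_t requantized_bilinear(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                             float dx, float dy, UniformQInfo in_qi, UniformQInfo out_qi)
{
    const float f = dequantize(a00, in_qi) * (1.f - dx) * (1.f - dy) +
                    dequantize(a01, in_qi) * dx * (1.f - dy) +
                    dequantize(a10, in_qi) * (1.f - dx) * dy +
                    dequantize(a11, in_qi) * dx * dy;
    return quantize(f, out_qi);
}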
* @param[in] info @ref ScaleKernelInfo to use for configuration */ - void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + void configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * @@ -67,11 +82,15 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, const ScaleKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct ScaleKernel @@ -89,28 +108,48 @@ private: * * @note Used only in case down-sampling. */ - void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_area_nchw_u8(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); /** function to perform scale using bilinear interpolation on the given window */ template - void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); /** function to perform scale using bilinear interpolation on the given window */ template - void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_bilinear_qasymm(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); /** function to perform scale using nearest neighbour on the given window */ template - void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); #endif // ENABLE_NCHW_KERNELS - ScaleFunctionPtr _func{ nullptr }; + ScaleFunctionPtr _func{nullptr}; InterpolationPolicy _policy{}; BorderMode _border_mode{}; PixelValue _constant_border_value{}; - float _sampling_offset{ 0 }; - bool _align_corners{ false }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - ScaleKernelPtr _run_method{ nullptr }; + float _sampling_offset{0}; + bool _align_corners{false}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + ScaleKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index e06ab9917c..ce144351f8 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -30,11 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include 
"src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/softmax/list.h" namespace arm_compute @@ -46,61 +46,44 @@ namespace kernels namespace { /* Softmax Logits 1D Max - identifying the max value of 1D Logits */ -static const std::vector available_kernels_max_logits = -{ - { - "sve_fp32_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_logits) - }, - { - "sve_fp16_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_logits) - }, - { - "sve_qu8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, - REGISTER_QASYMM8_SVE(sve_qasymm8_logits) - }, - { - "sve_qs8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, - REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits) - }, - { - "neon_fp32_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_logits) - }, - { - "neon_fp16_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_logits) - }, - { - "neon_qu8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(neon_qasymm8_logits) - }, - { - "neon_qs8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits) - }, +static const std::vector available_kernels_max_logits = { + {"sve_fp32_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(sve_fp32_logits)}, + {"sve_fp16_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(sve_fp16_logits)}, + {"sve_qu8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, + REGISTER_QASYMM8_SVE(sve_qasymm8_logits)}, + {"sve_qs8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, + REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)}, + {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_logits)}, + {"neon_fp16_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_logits)}, + {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qasymm8_logits)}, + {"neon_qs8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)}, }; Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, 
DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); // Validate in case of configured output - if(output.total_size() != 0) + if (output.total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), + TensorShape(input.tensor_shape()).set(0, 1)); } return Status{}; @@ -121,7 +104,7 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) // Output auto initialization if not yet initialized auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _run_method = uk->ukernel; @@ -158,60 +141,46 @@ const char *CpuLogits1DMaxKernel::name() const } /* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */ -template -static const std::vector::SoftmaxLogits1DKernel> available_kernels_logits = -{ - { - "sve2_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax) - }, - { - "sve2_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax) - }, - { - "sve_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_softmax) - }, - { - "sve_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_softmax) - }, - - { - "neon_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_softmax) - }, - { - "neon_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_softmax) - }, - { - "neon_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax) - }, - { - "neon_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax) - }, +template +static const std::vector::SoftmaxLogits1DKernel> available_kernels_logits = { + {"sve2_qu8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)}, + {"sve2_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)}, + 
{"sve_fp32_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(sve_fp32_softmax)}, + {"sve_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(sve_fp16_softmax)}, + + {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax)}, + {"neon_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax)}, + {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, + {"neon_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, }; namespace { -Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max, - const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log) +Status validate_arguments_logits_softmax(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const float beta, + const ITensorInfo &tmp, + bool is_log) { ARM_COMPUTE_UNUSED(beta); // Check input ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); @@ -221,16 +190,18 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); // Check output if configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info(); + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) + : dst.quantization_info(); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); } // Check tmp if configured - if(tmp.total_size() != 0) + if (tmp.total_size() != 0) { const DataType tmp_data_type = is_quantized_asymmetric ? 
DataType::F32 : src.data_type(); ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); @@ -243,14 +214,16 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn } } // namespace -template -const std::vector::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel::get_available_kernels() +template +const std::vector::SoftmaxLogits1DKernel> & +CpuLogits1DSoftmaxKernel::get_available_kernels() { return available_kernels_logits; } template -void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +void CpuLogits1DSoftmaxKernel::configure( + const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); @@ -259,17 +232,21 @@ void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const I const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); // Output auto initialization if not yet initialized - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info(); + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) + : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); // Tmp auto initialization if not yet initialized const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); - const auto *uk = CpuLogits1DSoftmaxKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuLogits1DSoftmaxKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + std::string kernel_name = + IS_LOG ? 
std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); _beta = beta; _run_method = uk->ukernel; @@ -282,8 +259,8 @@ void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const I } template -Status CpuLogits1DSoftmaxKernel::validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +Status CpuLogits1DSoftmaxKernel::validate( + const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); @@ -305,7 +282,7 @@ void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); @@ -314,7 +291,7 @@ void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window } template -const char *CpuLogits1DSoftmaxKernel::name() const +const char *CpuLogits1DSoftmaxKernel::name() const { return _name.c_str(); } diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 59f43bd1d2..5d288179fd 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -57,7 +57,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct SoftmaxLogits1DMaxKernel @@ -70,7 +70,7 @@ public: static const std::vector &get_available_kernels(); private: - SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; + SoftmaxLogits1DMaxKernelPtr _run_method{nullptr}; std::string _name{}; }; @@ -79,7 +79,8 @@ template class CpuLogits1DSoftmaxKernel : public ICpuKernel> { private: - using SoftmaxLogits1DKernelPtr = std::add_pointer::type; + using SoftmaxLogits1DKernelPtr = std::add_pointer::type; public: CpuLogits1DSoftmaxKernel() = default; @@ -95,18 +96,22 @@ public: * * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + void + configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuLogits1DSoftmaxKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp); + static Status validate(const ITensorInfo *src, + const ITensorInfo *max, + const ITensorInfo *dst, + const float beta, + const ITensorInfo *tmp); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct SoftmaxLogits1DKernel @@ -119,8 +124,8 @@ public: static const std::vector &get_available_kernels(); private: - float _beta{ 1.0f }; - SoftmaxLogits1DKernelPtr _run_method{ nullptr }; + float _beta{1.0f}; + SoftmaxLogits1DKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp index 875d613dca..2b2c6f2e92 100644 --- a/src/cpu/kernels/CpuSubKernel.cpp +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/add/generic/neon/impl.h" @@ -51,70 +52,48 @@ namespace using CpuSubKernelDataTypeISASelectorData = CpuAddKernelDataTypeISASelectorData; using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; -static const std::vector available_kernels = -{ - { - "neon_fp32_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_fp16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_u8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s32_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_qu8_sub_fixedpoint", - [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint) - }, - { - "neon_qs8_sub_fixedpoint", - [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint) - }, - { - "neon_qu8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == 
DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) - }, - { - "neon_qs8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) - }, - { - "neon_qs16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) - }, +static const std::vector available_kernels = { + {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_fp16_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon)}, + {"neon_qu8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)}, + {"neon_qs8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)}, + {"neon_qu8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)}, + {"neon_qs8_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)}, + {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)}, }; -inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +inline Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuSubKernel::get_implementation( 
+ CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -125,7 +104,7 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src "Convert policy cannot be WRAP if datatype is quantized"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), @@ -147,7 +126,8 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I set_data_type_if_unknown(*dst, src0->data_type()); const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuSubKernel::get_implementation( + CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -167,14 +147,14 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &sub_same_neon) + if (this->_run_method == &sub_same_neon) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } @@ -184,7 +164,7 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -203,7 +183,8 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return ICPPKernel::default_mws; } -Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +Status +CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h index cd209d1837..5fa0dc411a 100644 --- a/src/cpu/kernels/CpuSubKernel.h +++ b/src/cpu/kernels/CpuSubKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuSubKernel : public ICpuKernel { private: - using SubKernelPtr = std::add_pointer::type; + using SubKernelPtr = std::add_pointer::type; using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; public: @@ -68,7 +69,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -99,9 +101,9 @@ public: private: ConvertPolicy _policy{}; - SubKernelPtr _run_method{ nullptr }; + SubKernelPtr _run_method{nullptr}; 
std::string _name{}; - size_t _split_dimension{ Window::DimY }; + size_t _split_dimension{Window::DimY}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp index b2cebc4230..615bc6ce1e 100644 --- a/src/cpu/kernels/CpuTransposeKernel.cpp +++ b/src/cpu/kernels/CpuTransposeKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,7 +46,7 @@ namespace { unsigned int num_elems_processed(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return 8; @@ -81,10 +82,10 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -101,87 +102,121 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint8x8_t row0 = vld1_u8(reinterpret_cast(input.ptr() + x + 0 * input_stride_in_bytes)); - const uint8x8_t row1 = vld1_u8(reinterpret_cast(input.ptr() + x + 1 * input_stride_in_bytes)); - const uint8x8_t row2 = vld1_u8(reinterpret_cast(input.ptr() + x + 2 * input_stride_in_bytes)); - const uint8x8_t row3 = vld1_u8(reinterpret_cast(input.ptr() + x + 3 * input_stride_in_bytes)); - const uint8x8_t row4 = vld1_u8(reinterpret_cast(input.ptr() + x + 4 * input_stride_in_bytes)); - const uint8x8_t row5 = vld1_u8(reinterpret_cast(input.ptr() + x + 5 * input_stride_in_bytes)); - const uint8x8_t row6 = vld1_u8(reinterpret_cast(input.ptr() + x + 6 * input_stride_in_bytes)); - const uint8x8_t row7 = vld1_u8(reinterpret_cast(input.ptr() + x + 7 * input_stride_in_bytes)); - - // Transpose 2x2 - const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); - const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); - const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); - const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); - - // Transpose 4x4 - const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); - const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); - const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); - const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); - - // Transpose 8x8 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); 
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); - const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); - const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); - } - - // Compute left-over elements along the x dimension (1x8) - for(; x < window_end_x; ++x) - { - const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); - const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); - const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); - const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); - const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); - const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); - const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); - const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); - - uint8x8_t result = vdup_n_u8(0); - result = vset_lane_u8(val0, result, 0); - result = vset_lane_u8(val1, result, 1); - result = vset_lane_u8(val2, result, 2); - result = vset_lane_u8(val3, result, 3); - result = vset_lane_u8(val4, result, 4); - result = vset_lane_u8(val5, result, 5); - result = vset_lane_u8(val6, result, 6); - result = vset_lane_u8(val7, result, 7); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(output.ptr() + dst_offset_in_bytes, result); - } - }, - input, output); + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x8_t row0 = + vld1_u8(reinterpret_cast(input.ptr() + x + 0 * input_stride_in_bytes)); + const uint8x8_t row1 = + vld1_u8(reinterpret_cast(input.ptr() + x + 1 * input_stride_in_bytes)); + const uint8x8_t row2 = + vld1_u8(reinterpret_cast(input.ptr() + x + 2 * input_stride_in_bytes)); + const uint8x8_t row3 = + vld1_u8(reinterpret_cast(input.ptr() + x + 3 * input_stride_in_bytes)); + const uint8x8_t row4 = + vld1_u8(reinterpret_cast(input.ptr() + x + 4 * 
input_stride_in_bytes)); + const uint8x8_t row5 = + vld1_u8(reinterpret_cast(input.ptr() + x + 5 * input_stride_in_bytes)); + const uint8x8_t row6 = + vld1_u8(reinterpret_cast(input.ptr() + x + 6 * input_stride_in_bytes)); + const uint8x8_t row7 = + vld1_u8(reinterpret_cast(input.ptr() + x + 7 * input_stride_in_bytes)); + + // Transpose 2x2 + const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); + const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); + const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); + const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); + + // Transpose 4x4 + const uint16x4x2_t k0_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); + const uint16x4x2_t k1_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); + const uint16x4x2_t k2_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); + const uint16x4x2_t k3_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); + + // Transpose 8x8 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); + const uint32x2x2_t k1_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); + const uint32x2x2_t k2_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); + const uint32x2x2_t k3_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); + vst1_u8( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); + } + + // Compute left-over elements along the x dimension (1x8) + for (; x < window_end_x; ++x) + { + const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); + const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); + const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); + const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); + const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); + const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); + const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); + const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); + + 
uint8x8_t result = vdup_n_u8(0); + result = vset_lane_u8(val0, result, 0); + result = vset_lane_u8(val1, result, 1); + result = vset_lane_u8(val2, result, 2); + result = vset_lane_u8(val3, result, 3); + result = vset_lane_u8(val4, result, 4); + result = vset_lane_u8(val5, result, 5); + result = vset_lane_u8(val6, result, 6); + result = vset_lane_u8(val7, result, 7); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8(output.ptr() + dst_offset_in_bytes, result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -190,16 +225,18 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint8_t val0 = *input.ptr(); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint8_t val0 = *input.ptr(); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; - *(output.ptr() + dst_offset_in_bytes) = val0; - }, - input, output); + *(output.ptr() + dst_offset_in_bytes) = val0; + }, + input, output); } } @@ -220,10 +257,10 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -240,61 +277,77 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint16x4_t row0 = vld1_u16(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16x4_t row1 = vld1_u16(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16x4_t row2 = vld1_u16(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16x4_t row3 = vld1_u16(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); - const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); - - // Transpose 4x4 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * 
sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint16x4_t result = vdup_n_u16(0); - result = vset_lane_u16(val0, result, 0); - result = vset_lane_u16(val1, result, 1); - result = vset_lane_u16(val2, result, 2); - result = vset_lane_u16(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x4_t row0 = + vld1_u16(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16x4_t row1 = + vld1_u16(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16x4_t row2 = + vld1_u16(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16x4_t row3 = + vld1_u16(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); + const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); + + // Transpose 4x4 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); + const uint32x2x2_t k1_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[0])); + vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[0])); + vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[1])); + vst1_u16( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint16_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint16x4_t result = vdup_n_u16(0); + result = 
vset_lane_u16(val0, result, 0); + result = vset_lane_u16(val1, result, 1); + result = vset_lane_u16(val2, result, 2); + result = vset_lane_u16(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -303,16 +356,18 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint16_t val0 = *(reinterpret_cast(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } @@ -347,10 +402,10 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -367,102 +422,160 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - // Load - const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4x2_t row3 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); - const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); - const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); - const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); - - // Transpose 2x4 - const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), 
vtrn2q_u32(row0.val[0], row1.val[0])}; - const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])}; - const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])}; - const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])}; - const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])}; - const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])}; - const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])}; - const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])}; - - // Transpose 2x2 - const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))}; - const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))}; - const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))}; - const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))}; - const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))}; - const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))}; - const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))}; - const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))}; - - // Swap blocks - const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])}; - const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])}; - const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])}; - const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])}; - const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])}; - const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])}; - const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])}; - const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])}; - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Store - vst1q_u32_x2_(reinterpret_cast(output.ptr() + 
dst_offset_in_bytes + 0 * output_stride_in_bytes), col0); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6); - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7); - } - - // Compute left-over elements (8x1) - for(; x < window_end_x; ++x) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - const uint32_t val4 = *(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); - const uint32_t val5 = *(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); - const uint32_t val6 = *(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); - const uint32_t val7 = *(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); - - uint32x4_t result0 = vdupq_n_u32(0); - uint32x4_t result1 = vdupq_n_u32(0); - result0 = vsetq_lane_u32(val0, result0, 0); - result0 = vsetq_lane_u32(val1, result0, 1); - result0 = vsetq_lane_u32(val2, result0, 2); - result0 = vsetq_lane_u32(val3, result0, 3); - result1 = vsetq_lane_u32(val4, result1, 0); - result1 = vsetq_lane_u32(val5, result1, 1); - result1 = vsetq_lane_u32(val6, result1, 2); - result1 = vsetq_lane_u32(val7, result1, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes), {result0, result1}); - } - }, - input, output); + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load + const uint32x4x2_t row0 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4x2_t row1 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4x2_t row2 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4x2_t row3 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32x4x2_t row4 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32x4x2_t row5 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32x4x2_t row6 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); + const uint32x4x2_t row7 = + vld1q_u32_x2_(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); + + // Transpose 2x4 + const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), + vtrn2q_u32(row0.val[0], row1.val[0])}; + const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), + vtrn2q_u32(row0.val[1], row1.val[1])}; + const 
uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), + vtrn2q_u32(row2.val[0], row3.val[0])}; + const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), + vtrn2q_u32(row2.val[1], row3.val[1])}; + const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), + vtrn2q_u32(row4.val[0], row5.val[0])}; + const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), + vtrn2q_u32(row4.val[1], row5.val[1])}; + const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), + vtrn2q_u32(row6.val[0], row7.val[0])}; + const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), + vtrn2q_u32(row6.val[1], row7.val[1])}; + + // Transpose 2x2 + const uint64x2x2_t k0_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))}; + const uint64x2x2_t k1_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))}; + const uint64x2x2_t k2_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))}; + const uint64x2x2_t k3_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))}; + const uint64x2x2_t k4_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))}; + const uint64x2x2_t k5_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))}; + const uint64x2x2_t k6_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))}; + const uint64x2x2_t k7_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))}; + + // Swap blocks + const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), + vreinterpretq_u32_u64(k4_u64.val[0])}; + const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), + vreinterpretq_u32_u64(k5_u64.val[0])}; + const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), + vreinterpretq_u32_u64(k4_u64.val[1])}; + const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), + vreinterpretq_u32_u64(k5_u64.val[1])}; + const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), + vreinterpretq_u32_u64(k6_u64.val[0])}; + const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), + vreinterpretq_u32_u64(k7_u64.val[0])}; + const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), + vreinterpretq_u32_u64(k6_u64.val[1])}; + const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), + vreinterpretq_u32_u64(k7_u64.val[1])}; + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Store + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + col0); + vst1q_u32_x2_( + 
reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + col1); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + col2); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + col3); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + col4); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + col5); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + col6); + vst1q_u32_x2_( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + col7); + } + + // Compute left-over elements (8x1) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32_t val4 = *(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32_t val5 = *(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32_t val6 = *(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes) + x); + const uint32_t val7 = *(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes) + x); + + uint32x4_t result0 = vdupq_n_u32(0); + uint32x4_t result1 = vdupq_n_u32(0); + result0 = vsetq_lane_u32(val0, result0, 0); + result0 = vsetq_lane_u32(val1, result0, 1); + result0 = vsetq_lane_u32(val2, result0, 2); + result0 = vsetq_lane_u32(val3, result0, 3); + result1 = vsetq_lane_u32(val4, result1, 0); + result1 = vsetq_lane_u32(val5, result1, 1); + result1 = vsetq_lane_u32(val6, result1, 2); + result1 = vsetq_lane_u32(val7, result1, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32_x2_(reinterpret_cast(output.ptr() + dst_offset_in_bytes), {result0, result1}); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -471,40 +584,42 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = *(reinterpret_cast(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } #else // __aarch64__ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) { - const int window_step_x = 4; - const int window_step_y = 4; - const 
int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + const int window_step_x = 4; + const int window_step_y = 4; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; // Check if we need a left-over loop for the y dimension bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -521,60 +636,74 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint32x4_t row0 = vld1q_u32(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4_t row1 = vld1q_u32(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4_t row2 = vld1q_u32(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4_t row3 = vld1q_u32(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); - const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); - const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); - const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Swap block 01 with block 10 and store - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], 
k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint32x4_t result = vdupq_n_u32(0); - result = vsetq_lane_u32(val0, result, 0); - result = vsetq_lane_u32(val1, result, 1); - result = vsetq_lane_u32(val2, result, 2); - result = vsetq_lane_u32(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint32x4_t row0 = + vld1q_u32(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4_t row1 = + vld1q_u32(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4_t row2 = + vld1q_u32(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4_t row3 = + vld1q_u32(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); + const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); + const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); + const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Swap block 01 with block 10 and store + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[0], k3_u32.val[0])); + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[1], k3_u32.val[1])); + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[0], k1_u32.val[0])); + vst1q_u32( + reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[1], k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint32x4_t result = vdupq_n_u32(0); + result = vsetq_lane_u32(val0, result, 0); + result = vsetq_lane_u32(val1, result, 1); + result = vsetq_lane_u32(val2, result, 2); + result = vsetq_lane_u32(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { 
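The 4x4 path above packs the whole shuffle into the window loop, which makes the vtrn/vcombine dance hard to follow. A standalone sketch of the same idea, assuming a NEON target (arm_neon.h) and plain row-major uint32_t buffers rather than the library's strided iterators:

// transpose4x4_u32.cpp - standalone sketch of the vtrn/vcombine 4x4 transpose
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Transpose a row-major 4x4 block of uint32_t from src into dst.
static void transpose4x4_u32(const uint32_t *src, uint32_t *dst)
{
    const uint32x4_t row0 = vld1q_u32(src + 0);
    const uint32x4_t row1 = vld1q_u32(src + 4);
    const uint32x4_t row2 = vld1q_u32(src + 8);
    const uint32x4_t row3 = vld1q_u32(src + 12);

    // Interleave pairs of elements from adjacent rows (2x2 transposes).
    const uint32x2x2_t k0 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));   // (a0,b0) (a1,b1)
    const uint32x2x2_t k1 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); // (c2,d2) (c3,d3)
    const uint32x2x2_t k2 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); // (a2,b2) (a3,b3)
    const uint32x2x2_t k3 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));   // (c0,d0) (c1,d1)

    // Recombine halves so each output row is one column of the input.
    vst1q_u32(dst + 0,  vcombine_u32(k0.val[0], k3.val[0])); // a0 b0 c0 d0
    vst1q_u32(dst + 4,  vcombine_u32(k0.val[1], k3.val[1])); // a1 b1 c1 d1
    vst1q_u32(dst + 8,  vcombine_u32(k2.val[0], k1.val[0])); // a2 b2 c2 d2
    vst1q_u32(dst + 12, vcombine_u32(k2.val[1], k1.val[1])); // a3 b3 c3 d3
}

int main()
{
    uint32_t in[16], out[16];
    for (uint32_t i = 0; i < 16; ++i)
        in[i] = i;
    transpose4x4_u32(in, out);
    for (int r = 0; r < 4; ++r)
        std::printf("%2u %2u %2u %2u\n", (unsigned)out[4 * r + 0], (unsigned)out[4 * r + 1],
                    (unsigned)out[4 * r + 2], (unsigned)out[4 * r + 3]);
    return 0;
}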
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -583,16 +712,18 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = *(reinterpret_cast(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } #endif // __aarch64__ @@ -616,7 +747,8 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size()); // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped Coordinates coord; @@ -637,7 +769,7 @@ Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *d "Element size not supported"); // Validate configured destination - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); @@ -658,7 +790,7 @@ void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cons const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(src->info()->element_size()) + switch (src->info()->element_size()) { case 1: transpose_8bit_elements(src, dst, window); diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h index cb85daeb40..e79a405677 100644 --- a/src/cpu/kernels/CpuTransposeKernel.h +++ b/src/cpu/kernels/CpuTransposeKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp index 2ccc977995..297ba63826 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,7 +39,7 @@ namespace { TensorShape get_output_shape(const ITensorInfo *src, bool has_bias) { - TensorShape output_shape{ src->tensor_shape() }; + TensorShape output_shape{src->tensor_shape()}; 
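get_output_shape above collapses the first three weight dimensions, and the element-by-element copy loop in run_op further down fills one column per kernel, appending the bias last. A rough sketch of that layout with plain row-major buffers (the names and flat indexing are illustrative, not the library's strided tensors):

// weights_reshape_sketch.cpp - conceptual sketch of the weights-reshape layout
#include <cstddef>
#include <cstdio>
#include <vector>

// src: N kernels of size W*H*C, stored kernel-major as src[n][c][h][w] flattened.
// dst: matrix with (W*H*C + has_bias) rows and N columns; kernel n becomes column n,
//      with its optional bias appended as the last row.
static std::vector<float> reshape_weights(const std::vector<float> &src,
                                          const std::vector<float> &bias, // empty if no bias
                                          size_t W, size_t H, size_t C, size_t N)
{
    const bool   has_bias = !bias.empty();
    const size_t rows     = W * H * C + (has_bias ? 1 : 0);
    std::vector<float> dst(rows * N, 0.0f);

    for (size_t n = 0; n < N; ++n)
    {
        size_t row = 0;
        for (size_t c = 0; c < C; ++c)
            for (size_t h = 0; h < H; ++h)
                for (size_t w = 0; w < W; ++w)
                    dst[row++ * N + n] = src[((n * C + c) * H + h) * W + w];
        if (has_bias)
            dst[row * N + n] = bias[n];
    }
    return dst;
}

int main()
{
    // Two 2x2x1 kernels plus per-kernel bias -> a 5 x 2 matrix.
    const std::vector<float> w = {1, 2, 3, 4, 5, 6, 7, 8};
    const std::vector<float> b = {0.5f, -0.5f};
    const auto m = reshape_weights(w, b, 2, 2, 1, 2);
    std::printf("column 0: %g %g %g %g | bias %g\n", m[0], m[2], m[4], m[6], m[8]);
    return 0;
}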
output_shape.collapse(3); const size_t tmp_dim = output_shape[0]; @@ -54,20 +55,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, con //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2)); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4])); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || + biases->dimension(1) != src->tensor_shape()[4])); } // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + get_output_shape(src, biases != nullptr)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -84,9 +87,7 @@ void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInf auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr)))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - biases, - dst)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst)); // Configure kernel Window window = calculate_max_window(*src, Steps()); @@ -122,44 +123,47 @@ void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, // Create iterators Iterator in(src, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Get column index - const int kernel_idx = id[3]; - const int kernel_idz = id[4]; - - // Setup pointers - const uint8_t *tmp_input_ptr = in.ptr(); - uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); - const uint8_t *curr_input_row_ptr = tmp_input_ptr; - const uint8_t *curr_input_depth_ptr = tmp_input_ptr; - - // Linearize volume - for(unsigned int d = 0; d < kernel_depth; ++d) + execute_window_loop( + window, + [&](const Coordinates &id) { - for(unsigned int j = 0; j < kernel_size_y; ++j) + // Get column index + const int kernel_idx = id[3]; + const int kernel_idz = id[4]; + + // Setup pointers + const uint8_t *tmp_input_ptr = in.ptr(); + uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); + const uint8_t *curr_input_row_ptr = tmp_input_ptr; + const uint8_t *curr_input_depth_ptr = tmp_input_ptr; + + // Linearize volume + for (unsigned int d = 0; d < kernel_depth; ++d) { - for(unsigned int i = 0; i < kernel_size_x; ++i) + for (unsigned int j = 0; j < kernel_size_y; ++j) { - std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); - tmp_input_ptr += input_stride_x; - tmp_output_ptr += output_stride_y; + for (unsigned int i = 
0; i < kernel_size_x; ++i) + { + std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); + tmp_input_ptr += input_stride_x; + tmp_output_ptr += output_stride_y; + } + curr_input_row_ptr += input_stride_y; + tmp_input_ptr = curr_input_row_ptr; } - curr_input_row_ptr += input_stride_y; - tmp_input_ptr = curr_input_row_ptr; + curr_input_depth_ptr += input_stride_z; + curr_input_row_ptr = curr_input_depth_ptr; + tmp_input_ptr = curr_input_depth_ptr; } - curr_input_depth_ptr += input_stride_z; - curr_input_row_ptr = curr_input_depth_ptr; - tmp_input_ptr = curr_input_depth_ptr; - } - // Add bias - if(biases != nullptr) - { - std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size()); - } - }, - in); + // Add bias + if (biases != nullptr) + { + std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), + src->info()->element_size()); + } + }, + in); } const char *CpuWeightsReshapeKernel::name() const { @@ -167,4 +171,4 @@ const char *CpuWeightsReshapeKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h index 1a260edc96..9310b3c784 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.h +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h @@ -82,7 +82,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp index 818d878119..52e3f2549c 100644 --- a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp @@ -28,8 +28,10 @@ namespace arm_compute { namespace cpu { -CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads) - : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads } +CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} { } @@ -49,24 +51,20 @@ void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const W const size_t input_row_stride = src_strides[height_idx] / element_size_in_bytes; const size_t input_col_stride = src_strides[width_idx] / element_size_in_bytes; const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes; - const auto input_nhwc_ptr = reinterpret_cast(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); - auto win_transf_ptr = reinterpret_cast(winograd_input_transform->buffer() + winograd_input_transform->info()->offset_first_element_in_bytes()); + const auto input_nhwc_ptr = + reinterpret_cast(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); + auto win_transf_ptr = reinterpret_cast(winograd_input_transform->buffer() + + winograd_input_transform->info()->offset_first_element_in_bytes()); - 
_winograd_impl.input_transform->execute( - _conv_args, - input_nhwc_ptr, - input_batch_stride, - input_row_stride, - input_col_stride, - win_transf_ptr, - _winograd_impl.winograd_spec, - workspace->buffer(), - info.thread_id, - _nthreads); + _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride, + input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec, + workspace->buffer(), info.thread_id, _nthreads); } -CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads) - : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads } +CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} { } @@ -88,28 +86,21 @@ void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const const size_t out_row_stride = dst_strides[height_idx] / element_size_in_bytes; const size_t out_col_stride = dst_strides[width_idx] / element_size_in_bytes; const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes; - const auto wout_transf_ptr = reinterpret_cast(winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes()); - auto dst_nhwc_ptr = reinterpret_cast(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes()); - void *biases_data_ptr = nullptr; - if(biases != nullptr) + const auto wout_transf_ptr = reinterpret_cast( + winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes()); + auto dst_nhwc_ptr = + reinterpret_cast(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes()); + void *biases_data_ptr = nullptr; + if (biases != nullptr) { biases_data_ptr = reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } // Output transform - _winograd_impl.output_transform->execute( - _conv_args, - wout_transf_ptr, - _winograd_impl.winograd_spec, - biases_data_ptr, - dst_nhwc_ptr, - out_batch_stride, - out_row_stride, - out_col_stride, - workspace->buffer(), - info.thread_id, - _nthreads); + _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr, + dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride, + workspace->buffer(), info.thread_id, _nthreads); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h index 0170dcae22..8a3b745e85 100644 --- a/src/cpu/kernels/CpuWinogradConv2dKernel.h +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h @@ -30,6 +30,7 @@ #include "arm_compute/core/Steps.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/assembly/winograd.hpp" #include "src/core/NEON/kernels/convolution/common/tensor.hpp" #include "src/cpu/ICpuKernel.h" @@ -53,7 +54,9 @@ public: /** Prevent instances of this class from being moved it contains references.*/ CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete; - CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads); 
+ CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -83,7 +86,9 @@ public: /** Prevent instances of this class from being moved it contains references.*/ CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete; - CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads); + CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -95,7 +100,7 @@ public: private: arm_conv::winograd::WinogradImpl &_winograd_impl; - const arm_conv::ConvolutionArgs &_conv_args; + const arm_conv::ConvolutionArgs &_conv_args; uint32_t _nthreads; }; diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp index e51b5b3423..ddc6dc24cd 100644 --- a/src/cpu/kernels/activation/generic/neon/fp16.cpp +++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp @@ -31,7 +31,7 @@ namespace cpu { namespace { -constexpr ActFpImplParams Fp16Params = { static_cast(1e-7), 8 }; +constexpr ActFpImplParams Fp16Params = {static_cast(1e-7), 8}; } // namespace void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) @@ -40,4 +40,4 @@ void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLaye } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp index 2a3b8a0bfd..e558f8c73e 100644 --- a/src/cpu/kernels/activation/generic/neon/fp32.cpp +++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp @@ -29,7 +29,7 @@ namespace cpu { namespace { -constexpr ActFpImplParams Fp32Params = { static_cast(1e-24), 4 }; +constexpr ActFpImplParams Fp32Params = {static_cast(1e-24), 4}; } // namespace void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h index 05885d8476..afeb6f7f3d 100644 --- a/src/cpu/kernels/activation/generic/neon/impl.h +++ b/src/cpu/kernels/activation/generic/neon/impl.h @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -56,10 +57,14 @@ inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &ma #endif /* __aarch64__ */ template -void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void fp_neon_activation_impl(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. 
*/ - using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t; + using ExactTagType = + typename arm_compute::wrapper::traits::neon_bitvector_tag_t; constexpr int window_step_x = P.step_x; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -72,12 +77,12 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL // to prevent NAN values caused by zeros in inputs to SQRT. // In case of aarh64, we call vsqrt directly, so we don't use delta. #ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast(P.delta), ExactTagType {}); + const auto delta = wrapper::vdup_n(static_cast(P.delta), ExactTagType{}); #else /* #ifndef __aarch64__ */ - const auto const_inv_2 = wrapper::vdup_n(static_cast(0.5f), ExactTagType {}); + const auto const_inv_2 = wrapper::vdup_n(static_cast(0.5f), ExactTagType{}); const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast(0.70710678118f), ExactTagType{}); #endif /* __aarch64__ */ - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); + const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); @@ -88,143 +93,154 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); const auto a = static_cast(act_info.a()); const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + wrapper::traits::neon_bitvector_t tmp; + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = 
wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: + const auto vin = wrapper::vloadq(input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, + wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, + wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: #ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); + tmp = wrapper::vsqrt(vin); #else /* __aarch64__ */ { const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); } #endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin)))))); - break; + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_6, + wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd( + const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin)))))); + break; #ifdef __aarch64__ - case ActivationLayerInfo::ActivationFunction::GELU: - tmp = 
wrapper::vmul(vin, wrapper::vmul(const_inv_2, wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2))))); - break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_2, + wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2))))); + break; #endif /* __aarch64__ */ - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T in = *(reinterpret_cast(input_ptr + x)); - T tmp; - switch(act) + // Compute left-over elements + for (; x < window_end_x; ++x) { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = in / (static_cast(1) + std::exp(-a * in)); - break; - case ActivationLayerInfo::ActivationFunction::GELU: - tmp = in * static_cast(0.5f * (1.0f + erff(static_cast(in) / 1.41421356237f))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); + const T in = *(reinterpret_cast(input_ptr + x)); + T tmp; + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max(static_cast(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min(a, std::max(static_cast(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min(a, std::max(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? 
in : std::log(static_cast(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = in / (static_cast(1) + std::exp(-a * in)); + break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = in * static_cast(0.5f * (1.0f + erff(static_cast(in) / 1.41421356237f))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp index c973e964e4..f289c80d4b 100644 --- a/src/cpu/kernels/activation/generic/neon/lut.cpp +++ b/src/cpu/kernels/activation/generic/neon/lut.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -33,19 +34,22 @@ namespace cpu #ifdef __aarch64__ void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && + src->info()->data_type() != DataType::QASYMM8_SIGNED); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); - }, - input, output); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); } #endif // __aarch64__ } // namespace cpu diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp index e7c146e46f..1451301ea2 100644 --- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -38,7 +39,10 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, 
const Window &window) +void neon_qasymm8_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -85,206 +89,222 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; + wrapper::traits::neon_bitvector_t tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_u8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
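The comment above notes that aarch64 routes these quantized activations through a LUT instead of the lane-wise math. A table works because a QASYMM8 input has only 256 possible values, so the whole dequantize-activate-requantize chain can be precomputed once. A sketch of building such a table; QuantInfo and the helper names are assumptions for illustration, not the library's API:

// activation_lut_sketch.cpp - why a 256-entry table is enough for QASYMM8 activations
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <functional>

struct QuantInfo
{
    float scale;
    int   offset; // zero point
};

static float dequantize(uint8_t q, const QuantInfo &qi)
{
    return (static_cast<int>(q) - qi.offset) * qi.scale;
}

static uint8_t quantize(float v, const QuantInfo &qi)
{
    const int q = static_cast<int>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}

// Tabulate f over all 256 possible inputs once; the kernel then only needs byte lookups.
static std::array<uint8_t, 256> make_activation_lut(const std::function<float(float)> &f,
                                                    const QuantInfo                   &qi_in,
                                                    const QuantInfo                   &qi_out)
{
    std::array<uint8_t, 256> lut{};
    for (int q = 0; q < 256; ++q)
        lut[q] = quantize(f(dequantize(static_cast<uint8_t>(q), qi_in)), qi_out);
    return lut;
}

// Example: a logistic activation folded into a table (quantization parameters made up).
// auto lut = make_activation_lut([](float x) { return 1.0f / (1.0f + std::exp(-x)); },
//                                {0.05f, 128}, {1.0f / 256.0f, 0});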
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
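The HARD_SWISH branch above evaluates x * min(6, max(0, x + 3)) / 6 on each de-quantized lane. A scalar reference of the same formula (illustrative only):

// hard_swish_sketch.cpp - scalar reference for the HARD_SWISH branch above
#include <algorithm>
#include <cstdio>
#include <initializer_list>

static float hard_swish(float x)
{
    // x * relu6(x + 3) / 6, written with the same min/max structure as the NEON code.
    return x * (std::min(6.0f, std::max(0.0f, x + 3.0f)) * (1.0f / 6.0f));
}

int main()
{
    for (float x : {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f})
        std::printf("hard_swish(% .1f) = % .4f\n", x, hard_swish(x));
    return 0;
}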
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - - const uint32x4x4_t pos_mask = + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) { - { + const auto vin_deq = vdequantize(vin, qi_in); + + const uint32x4x4_t pos_mask = {{ wrapper::vcgt(vin_deq.val[0], vconst_0_f32), wrapper::vcgt(vin_deq.val[1], vconst_0_f32), wrapper::vcgt(vin_deq.val[2], vconst_0_f32), wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; + }}; - const float32x4x4_t tmp_dep = - { - { + const float32x4x4_t tmp_dep = {{ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; + }}; - tmp = vquantize(tmp_dep, qi_out); - } + tmp = vquantize(tmp_dep, qi_out); + } #else // #ifndef __aarch64__ - else if (act == ActivationLayerInfo::ActivationFunction::GELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::GELU) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))), - 
wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul(vin_deq.val[0], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[0], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[1], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[1], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[2], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[2], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[3], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[3], const_inv_sqrt_2))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #endif // __aarch64__ - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } + qasymm8_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
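The GELU branch above computes x * 0.5 * (1 + erf(x / sqrt(2))) on the de-quantized lanes, with const_inv_sqrt_2 standing in for 1/sqrt(2). A scalar reference (illustrative only):

// gelu_sketch.cpp - scalar reference for the GELU branch above
#include <cmath>
#include <cstdio>
#include <initializer_list>

static float gelu(float x)
{
    // x * 0.5 * (1 + erf(x / sqrt(2)))
    return x * 0.5f * (1.0f + std::erf(x * 0.70710678118f));
}

int main()
{
    for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f})
        std::printf("gelu(% .1f) = % .4f\n", x, gelu(x));
    return 0;
}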
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::GELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? 
tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::GELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #endif // __aarch64__ - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp index 52c396459b..a2f588245a 100644 --- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -36,7 +37,10 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -76,191 +80,195 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; + wrapper::traits::neon_bitvector_t tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_s8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_s8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = 
vminq_s8(va, vmaxq_s8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t 
tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); #ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { + const uint32x4x4_t pos_mask = {{ wrapper::vcgtz(vin_deq.val[0]), wrapper::vcgtz(vin_deq.val[1]), wrapper::vcgtz(vin_deq.val[2]), wrapper::vcgtz(vin_deq.val[3]), - } - }; + }}; #else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { + const uint32x4x4_t pos_mask = {{ wrapper::vcgt(vin_deq.val[0], vconst_0_f32), wrapper::vcgt(vin_deq.val[1], vconst_0_f32), wrapper::vcgt(vin_deq.val[2], vconst_0_f32), wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; + }}; #endif // __aarch64__ - const float32x4x4_t tmp_dep = - { - { + const float32x4x4_t tmp_dep = {{ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; + }}; - tmp = vquantize_signed(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + tmp = vquantize_signed(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_signed_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); - } + qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_signed_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = 
utility::clamp(support::cpp11::lround(tmp * s + o)); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp index 2aea6cba3c..891646ea00 100644 --- a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -38,7 +39,10 @@ namespace arm_compute { namespace cpu { -void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 8; const auto window_start_x = static_cast(window.x().start()); @@ -59,103 +63,94 @@ void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL const float a_f32 = act_info.a(); const float b_f32 = act_info.b(); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - wrapper::traits::neon_bitvector_t tmp; - ARM_COMPUTE_UNUSED(tmp); + wrapper::traits::neon_bitvector_t tmp; + ARM_COMPUTE_UNUSED(tmp); - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = + }}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } + }}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform 
activation - const float32x4x2_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { - { - wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])), - wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1])) - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])), + wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qsymm16_t in = *(reinterpret_cast(input_ptr + x)); - qsymm16_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = std::min(a_f32, std::max(b_f32, tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + qsymm16_t in = *(reinterpret_cast(input_ptr + x)); + qsymm16_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = std::min(a_f32, std::max(b_f32, tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp index 4757c60d8f..97399e01e0 100644 --- a/src/cpu/kernels/activation/generic/sve/fp16.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include -#include - #include "src/core/NEON/SVEMath.h" + #include +#include +#include namespace arm_compute { @@ -59,77 +59,87 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer const auto va = svdup_n_f16(act_info.a()); const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = 
reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svfloat16_t tmp; + svfloat16_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_f16(pg, input_ptr + x); - switch(act) + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f16_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f16_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f16_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = svmul_f16_z(pg, vin, svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin)))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f16(pg, output_ptr + x, tmp); + const auto vin = svld1_f16(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f16_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f16_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f16_z(pg, 
svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), + svmax_f16_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, + svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f16_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f16_z( + pg, vin, + svmul_f16_z( + pg, const_inv_6, + svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f16_z( + pg, vin, + svinv_f16_z(pg, svadd_f16_z(pg, const_1, + svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f16(pg, output_ptr + x, tmp); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp index 87f04c255a..d1b075d52c 100644 --- a/src/cpu/kernels/activation/generic/sve/fp32.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp @@ -26,13 +26,13 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" +#include #include #include -#include - namespace arm_compute { namespace cpu @@ -58,78 +58,89 @@ void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayer const auto va = svdup_n_f32(act_info.a()); const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svfloat32_t tmp; + svfloat32_t tmp; - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - const auto vin = svld1_f32(pg, input_ptr + x); - switch(act) + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f32_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f32_z(pg, const_0, vin); - break; - case 
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f32_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = svmul_f32_z(pg, vin, svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin)))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f32(pg, output_ptr + x, tmp); + const auto vin = svld1_f32(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f32_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f32_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), + svmax_f32_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, + svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, + svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f32_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case 
ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f32_z( + pg, vin, + svmul_f32_z( + pg, const_inv_6, + svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f32_z( + pg, vin, + svinv_f32_z(pg, svadd_f32_z(pg, const_1, + svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f32(pg, output_ptr + x, tmp); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp index d65de8d649..2ed667debf 100644 --- a/src/cpu/kernels/activation/generic/sve2/lut.cpp +++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -33,19 +34,22 @@ namespace cpu #ifdef __aarch64__ void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && + src->info()->data_type() != DataType::QASYMM8_SIGNED); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = input.ptr(); - auto output_ptr = output.ptr(); - lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); - }, - input, output); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = input.ptr(); + auto output_ptr = output.ptr(); + lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); } #endif // __aarch64__ } // namespace cpu diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp index bc9bc7aa3c..7efa9e4b72 100644 --- a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp @@ -26,18 +26,21 @@ #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include -#include - #include "src/core/NEON/SVEAsymm.h" #include "src/core/NEON/SVEMath.h" + #include +#include +#include namespace arm_compute { namespace cpu { -void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qasymm8_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -61,7 +64,7 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL // Initialise scale/offset for re-quantization bool requant = 
true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) { requant = false; } @@ -78,139 +81,160 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL const auto vo_s32 = svdup_n_s32(o_s32); // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svuint8_t tmp; + svuint8_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_u8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_u8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? 
svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = svmla_qasymm8_z(pg, tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); - - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); - - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = svcreate4_s32( - svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin)))); - - // Compare elements to input offset - if(qi_in.scale >= 0) + const auto vin = svld1_u8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmax_u8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? 
svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; } - - // Multiply negative elements and requantize if necessary - if(requant) + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = svmla_qasymm8_z(pg, tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin)))); + + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + 
svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); } else { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + ARM_COMPUTE_ERROR("Unsupported activation function"); } - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_u8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); + svst1_u8(pg, output_ptr + x, tmp); - } - while(svptest_any(svptrue_b8(), pg)); + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); - }, - input, output); + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp index d20684f54d..e4667522dd 100644 --- a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp @@ -24,20 +24,23 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include #include "src/core/NEON/SVEAsymm.h" #include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + #include +#include +#include namespace arm_compute { namespace cpu { -void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { const auto 
window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -65,7 +68,7 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti // Initialise scale/offset for re-quantization bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) { requant = false; } @@ -82,151 +85,190 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti const auto vo_s32 = svdup_n_s32(o_s32); // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svint8_t tmp; + svint8_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_s8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_s8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = requant ? 
svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32)))))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = svcreate4_s32( - svmovlb_s32(svmovlb_s16(vin)), - svmovlt_s32(svmovlb_s16(vin)), - svmovlb_s32(svmovlt_s16(vin)), - svmovlt_s32(svmovlt_s16(vin))); - - // Compare elements to input offset - if(qi_in.scale >= 0) + const auto vin = svld1_s8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmax_s8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? 
svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - - // Multiply negative elements and requantize if necessary - if(requant) + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, 
svget4_f32(vin_deq, 0), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 0), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 1), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 2), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 3), + const_3_f32)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + // Expand to int32 + const svint32x4_t vin_s32 = + svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)), + svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin))); - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 
3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } - svst1_s8(pg, output_ptr + x, tmp); + svst1_s8(pg, output_ptr + x, tmp); - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp index 5154fac8a7..f955893307 100644 --- a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp @@ -21,24 +21,27 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include -#include - #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/SVESymm.h" + #include +#include +#include namespace arm_compute { namespace cpu { -void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -56,62 +59,70 @@ void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL const auto va_f32 = svdup_n_f32(act_info.a()); const auto vb_f32 = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - svint16_t tmp; + svint16_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_s16(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1)))))); - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32)))); - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))), - svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1)))); - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } + const auto vin = svld1_s16(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1)))))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))), + svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } - svst1_s16(pg, output_ptr + x, tmp); + svst1_s16(pg, output_ptr + x, tmp); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp index fca7b2cd9f..e7679c14e3 100644 --- a/src/cpu/kernels/add/generic/neon/fp16.cpp +++ b/src/cpu/kernels/add/generic/neon/fp16.cpp @@ -30,10 +30,11 @@ namespace arm_compute { namespace cpu { -void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp index 1f599b1968..11a970bef4 100644 --- 
a/src/cpu/kernels/add/generic/neon/fp32.cpp +++ b/src/cpu/kernels/add/generic/neon/fp32.cpp @@ -28,9 +28,10 @@ namespace arm_compute { namespace cpu { -void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp index 2dde13544a..34938cc4c4 100644 --- a/src/cpu/kernels/add/generic/neon/impl.cpp +++ b/src/cpu/kernels/add/generic/neon/impl.cpp @@ -23,8 +23,10 @@ */ #include "src/cpu/kernels/add/generic/neon/impl.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -40,7 +42,10 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true); } -bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition) +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition) { const auto iq0 = src0->quantization_info().uniform(); const auto iq1 = src1->quantization_info().uniform(); @@ -49,7 +54,7 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI const auto scale0 = iq0.scale / oq.scale; const auto scale1 = iq1.scale / oq.scale; - if(scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f) + if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f) { // The scale factor cannot be stored as 5.11 signed fixed-point number. return false; @@ -57,9 +62,10 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset); - const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset)); + const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) + : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset)); - if(max_acc > 1048575.f) // 2^20 - 1 + if (max_acc > 1048575.f) // 2^20 - 1 { // It might not be possible to store the result as 21.11 signed fixed-point number. 
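For context, the feasibility test in the hunk above (add_sub_q8_neon_fixedpoint_possible) can be restated in a few lines of plain C++. This is a minimal standalone sketch, not the library's API: the function name, includes and example values are made up; only the thresholds (roughly +/-15 for the signed 5.11 scales, 2^20 - 1 for the 21.11 accumulator) are taken from the diff itself.

#include <cmath>
#include <iostream>

// Sketch of the bounds check that decides whether the fixed-point path is usable.
bool q8_fixedpoint_possible(float scale0, float scale1, float offset, bool is_addition)
{
    // Each rescaling factor is later stored as a signed 5.11 fixed-point value,
    // so its magnitude must stay within roughly +/-15.
    if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
    {
        return false;
    }
    // Worst-case accumulator: widened 8-bit inputs contribute up to ~256 per operand,
    // plus the combined offset; it must fit the integer part of a 21.11 value.
    const float max_acc = is_addition ? (std::fabs(scale0) + std::fabs(scale1)) * 256.f + std::fabs(offset)
                                      : (std::fabs(scale0) - std::fabs(scale1)) * 256.f + std::fabs(offset);
    return max_acc <= 1048575.f; // 2^20 - 1
}

int main()
{
    std::cout << std::boolalpha;
    std::cout << q8_fixedpoint_possible(0.5f, 0.25f, 12.f, true) << '\n'; // true
    std::cout << q8_fixedpoint_possible(20.f, 0.25f, 12.f, true) << '\n'; // false: scale exceeds the 5.11 range
    return 0;
}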
return false; @@ -69,13 +75,19 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI } template -void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint(src0, src1, dst, policy, window, true /*is_addition*/); } template -void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -103,7 +115,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso const auto oq_info = dst->info()->quantization_info().uniform(); const auto in0_scale = iq0_info.scale / oq_info.scale; const auto in1_scale = is_addition ? (iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale)); - const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset); + const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset); constexpr float _2pow11 = 2048; const auto in0_scale_5p11 = static_cast(support::cpp11::lround(in0_scale * _2pow11)); @@ -112,7 +124,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso constexpr uint8_t shift_amount_remainder = 3; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Prefix: a = non-broadcast, b = broadcast. @@ -138,68 +150,75 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto a_ptr = reinterpret_cast(a_input_it.ptr()); - const auto b_ptr = reinterpret_cast(b_input_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); - - const auto b_val = *b_ptr; - const auto b_scaled = b_scale * b_val; - const auto b_scaled_21p11 = static_cast(support::cpp11::lround(b_scaled * _2pow11)); - const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11; - const auto b_vscaled_offseted_21p11 = wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag()); + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); + + const auto b_val = *b_ptr; + const auto b_scaled = b_scale * b_val; + const auto b_scaled_21p11 = static_cast(support::cpp11::lround(b_scaled * _2pow11)); + const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11; + const auto b_vscaled_offseted_21p11 = + wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag()); #ifndef __aarch64__ - const auto b_scaled_offseted = b_scaled + offset; + const auto b_scaled_offseted = b_scaled + offset; #endif // __aarch64__ - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the input. - const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); - - // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. 
- const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); - const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); - - // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset. - // Widen and store the result in 32-bit integer. - const auto vout_21p11_00 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11); - const auto vout_21p11_01 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11); - const auto vout_21p11_10 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11); - const auto vout_21p11_11 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11); - - // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. - const auto vout_8p8_0 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_00), - wrapper::vqrshrn_ex(vout_21p11_01)); - const auto vout_8p8_1 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_10), - wrapper::vqrshrn_ex(vout_21p11_11)); - - // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<8>(vout_8p8_0), - wrapper::vqrshrn<8>(vout_8p8_1)); - - // Store the result. - wrapper::vstore(out_ptr + x, vout_8p0); - } - - // Process the left-over elements. - for(; x < window_end_x; ++x) - { + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the input. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. + const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset. + // Widen and store the result in 32-bit integer. + const auto vout_21p11_00 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. + const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_00), + wrapper::vqrshrn_ex(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_10), + wrapper::vqrshrn_ex(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. + wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. 
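The vector loop being reformatted here applies, per element, a simple fixed-point recipe: widen the 8-bit inputs, multiply by the Q5.11 scales, accumulate together with the Q21.11 offset in 32 bits, then narrow with two rounding shifts (by 3 bits down to Q8.8, then by 8 bits to an 8-bit result). Below is a scalar C++ sketch of the same arithmetic for the unsigned two-input case; helper names and example values are made up, and unlike the real kernel (which saturates at each narrowing step via vqrshrn) this sketch only clamps at the end.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Round-half-up arithmetic shift, used to drop fractional bits.
int32_t rounding_shift(int32_t v, int bits)
{
    return (v + (1 << (bits - 1))) >> bits;
}

uint8_t add_one_element_q8(uint8_t a, uint8_t b, float scale_a, float scale_b, float offset)
{
    const int32_t scale_a_5p11 = static_cast<int32_t>(std::lround(scale_a * 2048.f)); // Q5.11
    const int32_t scale_b_5p11 = static_cast<int32_t>(std::lround(scale_b * 2048.f)); // Q5.11
    const int32_t offset_21p11 = static_cast<int32_t>(std::lround(offset * 2048.f));  // Q21.11

    // Widen inputs and accumulate in a Q21.11 32-bit value.
    const int32_t acc_21p11 = offset_21p11 + int32_t(a) * scale_a_5p11 + int32_t(b) * scale_b_5p11;

    // Two-stage narrowing: Q21.11 -> Q8.8 -> integer, then clamp to the uint8 range.
    const int32_t acc_8p8 = rounding_shift(acc_21p11, 3);
    const int     result  = rounding_shift(acc_8p8, 8);
    return static_cast<uint8_t>(std::min(255, std::max(0, result)));
}

int main()
{
    // 100 * 0.5 + 50 * 0.25 + 3 = 65.5, which rounds to 66.
    std::cout << int(add_one_element_q8(100, 50, 0.5f, 0.25f, 3.f)) << '\n';
    return 0;
}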
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex(int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11)); + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex( + int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11)); #else // __aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted)); + out_ptr[x] = utility::clamp( + support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted)); #endif // __aarch64__ - } - }, - b_input_it, a_input_it, out_it); + } + }, + b_input_it, a_input_it, out_it); } else { @@ -216,70 +235,85 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto in0_ptr = reinterpret_cast(in0_it.ptr()); - const auto in1_ptr = reinterpret_cast(in1_it.ptr()); - const auto out_ptr = reinterpret_cast(out_it.ptr()); - - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); - const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); - - // Widen the input elements to signed 16-bit regardless of the input signedness. - const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); - const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); - const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); - const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); - - // Multiply the input elements by the scale factor and add the offset. - // Widen and store the result in 32-bit integer. - const auto vscaled0_offseted_21p11_00 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11); - const auto vscaled0_offseted_21p11_01 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11); - const auto vscaled0_offseted_21p11_10 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11); - const auto vscaled0_offseted_21p11_11 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11); - - const auto vout_21p11_00 = wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11); - const auto vout_21p11_01 = wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11); - const auto vout_21p11_10 = wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11); - const auto vout_21p11_11 = wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11); - - // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. - const auto vout_8p8_0 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_00), - wrapper::vqrshrn_ex(vout_21p11_01)); - const auto vout_8p8_1 = wrapper::vcombine( - wrapper::vqrshrn_ex(vout_21p11_10), - wrapper::vqrshrn_ex(vout_21p11_11)); - - // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<8>(vout_8p8_0), - wrapper::vqrshrn<8>(vout_8p8_1)); - - // Store the result. - wrapper::vstore(out_ptr + x, vout_8p0); - } - - // Process the left-over elements. 
- for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { + const auto in0_ptr = reinterpret_cast(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast(in1_it.ptr()); + const auto out_ptr = reinterpret_cast(out_it.ptr()); + + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. + const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + // Multiply the input elements by the scale factor and add the offset. + // Widen and store the result in 32-bit integer. + const auto vscaled0_offseted_21p11_00 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_01 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_10 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11); + const auto vscaled0_offseted_21p11_11 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11); + + const auto vout_21p11_00 = + wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. + const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_00), + wrapper::vqrshrn_ex(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex(vout_21p11_10), + wrapper::vqrshrn_ex(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. + wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. 
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex(int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11)); + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex( + int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11)); #else // __aarch64__ - out_ptr[x] = utility::clamp(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset)); + out_ptr[x] = utility::clamp( + support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset)); #endif // __aarch64__ - } - }, - in0_it, in1_it, out_it); + } + }, + in0_it, in1_it, out_it); } } -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -304,7 +338,7 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -324,63 +358,64 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = non_broadcast_input.ptr(); - const auto output_ptr = output.ptr(); - - const auto broadcast_value = *broadcast_input.ptr(); - const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); - const auto bfs = float(broadcast_value) * bf_scale + offset; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = non_broadcast_input.ptr(); + const auto output_ptr = output.ptr(); + + const auto broadcast_value = *broadcast_input.ptr(); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; - const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); - const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); - const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); - const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); - const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); - const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto 
af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(af_0); - rf_1 = vcvtnq_s32_f32(af_1); - rf_2 = vcvtnq_s32_f32(af_2); - rf_3 = vcvtnq_s32_f32(af_3); + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(af_0); - rf_1 = vcvtq_s32_f32(af_1); - rf_2 = vcvtq_s32_f32(af_2); - rf_3 = vcvtq_s32_f32(af_3); + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + rf_3 = vcvtq_s32_f32(af_3); #endif //__aarch64__ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - broadcast_input, non_broadcast_input, output); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -397,72 +432,78 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst const auto voffset = vdupq_n_f32(offset); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = input1.ptr(); - const auto input2_ptr = input2.ptr(); - const auto output_ptr = output.ptr(); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t a = vld1q_u8(input1_ptr + x); - const uint8x16_t b = vld1q_u8(input2_ptr + x); - - const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); - const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); - const auto b_u16_0 = vmovl_u8(vget_low_u8(b)); - const auto b_u16_1 = vmovl_u8(vget_high_u8(b)); - - const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); - const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); - const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); - const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); - - const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2); - const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2); - const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2); - 
const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto input1_ptr = input1.ptr(); + const auto input2_ptr = input2.ptr(); + const auto output_ptr = output.ptr(); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(input1_ptr + x); + const uint8x16_t b = vld1q_u8(input2_ptr + x); + + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + const auto b_u16_0 = vmovl_u8(vget_low_u8(b)); + const auto b_u16_1 = vmovl_u8(vget_high_u8(b)); + + const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2); + const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(bf_0); - rf_1 = vcvtnq_s32_f32(bf_1); - rf_2 = vcvtnq_s32_f32(bf_2); - rf_3 = vcvtnq_s32_f32(bf_3); + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(bf_0); - rf_1 = vcvtq_s32_f32(bf_1); - rf_2 = vcvtq_s32_f32(bf_2); - rf_3 = vcvtq_s32_f32(bf_3); + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); #endif //__aarch64__ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - input1, input2, output); + } + }, + input1, input2, output); } } -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + 
ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -487,7 +528,7 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -507,63 +548,64 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); - const auto bfs = float(broadcast_value) * bf_scale + offset; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); - const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; - const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); - const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); - const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); - const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(af_0); - rf_1 = vcvtnq_s32_f32(af_1); - rf_2 = vcvtnq_s32_f32(af_2); - rf_3 = vcvtnq_s32_f32(af_3); + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(af_0); - rf_1 = vcvtq_s32_f32(af_1); - rf_2 = vcvtq_s32_f32(af_2); - rf_3 = vcvtq_s32_f32(af_3); + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + 
rf_3 = vcvtq_s32_f32(af_3); #endif //__aarch64__ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - broadcast_input, non_broadcast_input, output); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -580,79 +622,102 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens const auto voffset = vdupq_n_f32(offset); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int8x16_t a = vld1q_s8(input1_ptr + x); - const int8x16_t b = vld1q_s8(input2_ptr + x); - - const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); - const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); - const auto b_s16_0 = vmovl_s8(vget_low_s8(b)); - const auto b_s16_1 = vmovl_s8(vget_high_s8(b)); - - const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); - const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); - const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); - const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); - - const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2); - const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2); - const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2); - const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(input1_ptr + x); + const int8x16_t b = vld1q_s8(input2_ptr + x); + + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + const auto b_s16_0 = vmovl_s8(vget_low_s8(b)); + const auto b_s16_1 = vmovl_s8(vget_high_s8(b)); + + const auto af_0 = 
vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2); + const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(bf_0); - rf_1 = vcvtnq_s32_f32(bf_1); - rf_2 = vcvtnq_s32_f32(bf_2); - rf_3 = vcvtnq_s32_f32(bf_3); + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(bf_0); - rf_1 = vcvtq_s32_f32(bf_1); - rf_2 = vcvtq_s32_f32(bf_2); - rf_3 = vcvtq_s32_f32(bf_3); + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); #endif //__aarch64__ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; #ifdef __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - input1, input2, output); + } + }, + input1, input2, output); } } -template void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); - -template void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); -template void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); - -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +template void add_q8_neon_fixedpoint( + const ITensor 
*src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); + +template void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +template void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h index fb786c5bc1..faa99baffe 100644 --- a/src/cpu/kernels/add/generic/neon/impl.h +++ b/src/cpu/kernels/add/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -35,7 +36,8 @@ namespace arm_compute namespace cpu { template -void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -53,7 +55,7 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -69,31 +71,36 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? 
wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; - } - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = (policy == ConvertPolicy::SATURATE) + ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) + : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) + ? wrapper::add_sat(broadcast_value, non_broadcast_v) + : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -106,31 +113,34 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2; - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = + (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = + (policy == ConvertPolicy::SATURATE) ? 
wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); } } @@ -138,17 +148,36 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); -bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition); - -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); - -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); template -void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); template -void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); } // namespace cpu } // namespace arm_compute #endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp index 5698d6d552..f0bcebc9d2 100644 --- a/src/cpu/kernels/add/generic/neon/integer.cpp +++ b/src/cpu/kernels/add/generic/neon/integer.cpp @@ -28,19 +28,22 @@ namespace arm_compute { namespace cpu { -void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_u8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -void add_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp index 69cca956c8..8195d229d9 100644 --- a/src/cpu/kernels/add/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp @@ -23,15 +23,17 @@ */ #include "arm_compute/core/ITensor.h" #include 
"arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp index dfdf8fe85b..7e23096239 100644 --- a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp @@ -23,15 +23,17 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp index e76e408d6e..ac2de0557a 100644 --- a/src/cpu/kernels/add/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp @@ -25,14 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute { namespace cpu { -void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +59,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -65,7 +67,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -74,48 +76,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); + const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #endif //__aarch64__ - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float 
afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -127,48 +131,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #endif //__aarch64__ - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); + const int16x8_t pa = 
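The leftover-elements loop above spells out, one value at a time, what the vector path does eight lanes at a time: dequantize both QSYMM16 inputs with their own scales, add in float, and requantize with the output scale. A minimal scalar sketch of that per-element contract follows; the function name and the bare scale parameters are illustrative stand-ins rather than the library API, and the rounding mirrors the AArch64 vcvtnq path.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Simplified per-element model of the QSYMM16 addition reformatted above.
// QSYMM16 is symmetric, so only scales are involved (no zero-point offsets).
int16_t add_qsymm16_scalar(int16_t a, int16_t b, float scale_a, float scale_b, float scale_out)
{
    const float afs = static_cast<float>(a) * scale_a;            // dequantize input 1
    const float bfs = static_cast<float>(b) * scale_b;            // dequantize input 2
    const float rf  = std::round((afs + bfs) / scale_out);        // requantize; round-to-nearest as on AArch64
    const float sat = std::max(-32768.f, std::min(32767.f, rf));  // saturate like vqmovn_s32
    return static_cast<int16_t>(sat);
}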
vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp index 581f3abded..01dfe6c44b 100644 --- a/src/cpu/kernels/add/generic/sve/fp16.cpp +++ b/src/cpu/kernels/add/generic/sve/fp16.cpp @@ -31,10 +31,11 @@ namespace arm_compute { namespace cpu { -void add_fp16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp index b37799113a..56771a5411 100644 --- a/src/cpu/kernels/add/generic/sve/fp32.cpp +++ b/src/cpu/kernels/add/generic/sve/fp32.cpp @@ -24,15 +24,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void add_fp32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp index e8606436fd..ca850fcef4 100644 --- a/src/cpu/kernels/add/generic/sve/impl.cpp +++ b/src/cpu/kernels/add/generic/sve/impl.cpp @@ -23,17 +23,21 @@ */ #include "src/cpu/kernels/add/generic/sve/impl.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { const auto all_true_pg = wrapper::svptrue(); const auto window_start_x = static_cast(window.x().start()); @@ -53,7 +57,7 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); Iterator output(dst, window); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
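The two SVE entry points above are thin forwarders into the shared add_same_sve template; the angle-bracketed template arguments do not survive in this extract, so the fp32 wrapper is reconstructed below with the <float> argument restored. The includes and namespaces are the ones the patched file itself uses.

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"

#include "src/cpu/kernels/add/generic/sve/impl.h"

namespace arm_compute
{
namespace cpu
{
// Per-type entry point: nothing more than a forward to the shared SVE template.
void add_fp32_sve(
    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
    return add_same_sve<float>(src0, src1, dst, policy, window);
}
} // namespace cpu
} // namespace arm_compute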
input2_win : input1_win; @@ -68,28 +72,30 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) + : svadd_z(pg, broadcast_value_vec, non_broadcast_v); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -101,35 +107,41 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto val1 = svld1(pg, input1_ptr + x); - const auto val2 = svld1(pg, input2_ptr + x); - const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto val1 = svld1(pg, input1_ptr + x); + const auto val2 = svld1(pg, input2_ptr + x); + const auto res = is_sat ? 
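The reformatted loop above is the stock SVE tail-handling idiom: build a while-less-than predicate, do a predicated load/compute/store, advance by the vector lane count, and keep iterating while any lane is still active, so no scalar remainder loop is needed. A self-contained sketch of the same pattern on raw float arrays, using plain ACLE intrinsics instead of the library's wrapper:: layer (compile for an SVE target, e.g. -march=armv8-a+sve):

#include <arm_sve.h>
#include <cstdint>

void add_f32_sve_sketch(const float *a, const float *b, float *dst, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n); // lanes covering [x, n) are active
    do
    {
        const svfloat32_t va  = svld1_f32(pg, a + x);    // predicated loads
        const svfloat32_t vb  = svld1_f32(pg, b + x);
        const svfloat32_t res = svadd_f32_z(pg, va, vb); // zeroing add, as in the non-saturating path
        svst1_f32(pg, dst + x, res);                     // store only the active lanes

        x += svcntw();                                   // number of 32-bit lanes in a vector
        pg = svwhilelt_b32(x, n);
    } while (svptest_any(svptrue_b32(), pg));            // continue while any lane remains
}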
wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h index 0136f14246..6a95d66826 100644 --- a/src/cpu/kernels/add/generic/sve/impl.h +++ b/src/cpu/kernels/add/generic/sve/impl.h @@ -33,7 +33,8 @@ namespace arm_compute namespace cpu { template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); } // namespace cpu } // namespace arm_compute #endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp index 3642dccd7b..4d17f2adbd 100644 --- a/src/cpu/kernels/add/generic/sve/integer.cpp +++ b/src/cpu/kernels/add/generic/sve/integer.cpp @@ -24,25 +24,29 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_u8_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -void add_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -void add_s32_sve(const ITensor *src0, const ITensor 
*src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp index 1dec214aa0..40add9d51b 100644 --- a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { -void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -58,7 +61,7 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto voffseto = svdup_n_f32(oq_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -78,48 +81,89 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); + const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); + const auto bf_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), + 
voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); - do - { - const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + do + { + const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + vscale1); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -136,45 +180,82 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto voffset1 = svdup_n_s32(iq1_info.offset); const auto voffset2 = svdup_n_s32(iq2_info.offset); - 
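The QASYMM8 path above widens each uint8 vector through svmovlb/svmovlt into four int32 sub-vectors, removes the input zero points, scales to float, adds, and then maps the sum back into the output's quantized domain via svmla against voffseto and 1/oq_info.scale before narrowing. A scalar reference of that per-element arithmetic, with illustrative parameter names rather than the library's types:

#include <algorithm>
#include <cstdint>

// Per-element model of the asymmetric-quantized addition vectorised above.
uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b,
                           float scale1, int32_t offset1,   // input 1 quantization
                           float scale2, int32_t offset2,   // input 2 quantization
                           float scale_o, int32_t offset_o) // output quantization
{
    const float af = (static_cast<int32_t>(a) - offset1) * scale1; // dequantize input 1
    const float bf = (static_cast<int32_t>(b) - offset2) * scale2; // dequantize input 2
    const float rf = offset_o + (af + bf) / scale_o;               // voffseto + sum * invvscaleo
    // The vector code truncates in svcvt_u32_f32 and saturates via svqxtn; clamp + cast models that here.
    return static_cast<uint8_t>(std::max(0.f, std::min(255.f, rf)));
}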
execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_u8(pg, input1_ptr + x); - const auto b = svld1_u8(pg, input2_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_u8(pg, input1_ptr + x); + const auto b = svld1_u8(pg, input2_ptr + x); + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + vscale1); + + const auto bf_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_1 
= svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), + vscale2); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp index dae8899753..2e585115e1 100644 --- a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { -void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_signed_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +60,7 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto voffseto = svdup_n_f32(oq_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -78,46 +81,63 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s8(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s8(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + 
vscale2); + + do + { + const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -134,46 +154,59 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * const auto voffset1 = svdup_n_s32(iq1_info.offset); const auto voffset2 = svdup_n_s32(iq2_info.offset); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_s8(pg, input1_ptr + x); - const auto b = svld1_s8(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - 
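Both quantized paths rebuild the 8-bit result with the SVE2 bottom/top narrowing pairs: svqxtnb saturates each element into the even lanes of the next-narrower type, and svqxtnt fills the odd lanes of an existing vector. Isolated from the loop, the rf_0..rf_3 to res chain used above for the signed case looks like this (requires an SVE2 target, e.g. -march=armv8-a+sve2):

#include <arm_sve.h>

// Saturating-narrow four int32 vectors back down to one int8 vector.
svint8_t narrow_s32x4_to_s8(svint32_t rf_0, svint32_t rf_1, svint32_t rf_2, svint32_t rf_3)
{
    const svint16_t pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); // even lanes from rf_0, odd lanes from rf_1
    const svint16_t pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
    return svqxtnt_s16(svqxtnb_s16(pa), pb);                   // interleave once more down to int8
}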
const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_s8(pg, input1_ptr + x); + const auto b = svld1_s8(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp index 8c48ded942..17a42c2138 100644 --- a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp +++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include namespace arm_compute { namespace cpu { -void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qsymm16_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -59,7 +62,7 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto all_true_pg = svptrue_b16(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool 
is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -74,39 +77,40 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s16(broadcast_value); + const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s16(broadcast_value); - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); - do - { - const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + do + { + const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); + svst1_s16(pg, output_ptr + x, res); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -118,37 +122,38 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - auto a = svld1_s16(pg, input1_ptr + x); - auto b = svld1_s16(pg, input2_ptr + x); - - 
const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + auto a = svld1_s16(pg, input1_ptr + x); + auto b = svld1_s16(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + svst1_s16(pg, output_ptr + x, res); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h index 7cdb70fd9e..1040c39a41 100644 --- a/src/cpu/kernels/add/list.h +++ b/src/cpu/kernels/add/list.h @@ -31,8 +31,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ADD_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +#define DECLARE_ADD_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) DECLARE_ADD_KERNEL(add_qasymm8_neon); DECLARE_ADD_KERNEL(add_qasymm8_signed_neon); @@ -55,4 +56,4 @@ DECLARE_ADD_KERNEL(add_qsymm16_sve2); } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_KERNELS_ADD_LIST_H \ No newline at end of file +#endif // SRC_CORE_KERNELS_ADD_LIST_H diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp index d8e5f694a8..b4b81aa78b 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/CpuTypes.h" #include @@ -38,16 +39,20 @@ namespace { using arm_compute::float16_t; -void a64_add_bn_clamp_direct_fp16_2x32( - float16_t *out, size_t out_stride, - float16_t *out_direct, size_t out_direct_stride, - const float16_t *in0, size_t in0_stride, - const float16_t *in1, size_t in1_stride, - const float16_t *bn_mul, - const float16_t *bn_add, - const float16_t minval, - 
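The rewrapped DECLARE_ADD_KERNEL macro in list.h is what keeps every add kernel on this single prototype; only the backslash-continued parameter list changes with the reformat. For reference, a use such as DECLARE_ADD_KERNEL(add_qasymm8_neon); expands to an ordinary declaration:

// Expansion of DECLARE_ADD_KERNEL(add_qasymm8_neon); with the reflowed macro:
void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy,
                      const Window &window);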
const float16_t maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out, + size_t out_stride, + float16_t *out_direct, + size_t out_direct_stride, + const float16_t *in0, + size_t in0_stride, + const float16_t *in1, + size_t in1_stride, + const float16_t *bn_mul, + const float16_t *bn_add, + const float16_t minval, + const float16_t maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -858,9 +863,14 @@ void a64_add_bn_clamp_direct_fp16_2x32( "subs x20, x20, #0x2\n" "bgt 8b\n" "58:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -869,8 +879,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp16_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -882,16 +899,16 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I float16_t minval = std::numeric_limits::lowest(); float16_t maxval = std::numeric_limits::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = static_cast(0.f); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = static_cast(0.f); maxval = static_cast(act_info.a()); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = static_cast(act_info.b()); 
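Before entering the assembly kernel, the code above folds the fused activation into a plain clamp range: RELU pins the lower bound at zero, BOUNDED_RELU additionally caps the result at act_info.a(), LU_BOUNDED_RELU uses [act_info.b(), act_info.a()], and anything else leaves the full numeric range. A condensed sketch of that mapping; the enum and the float specialisation are illustrative stand-ins for the library types:

#include <limits>
#include <utility>

enum class Act { NONE, RELU, BOUNDED_RELU, LU_BOUNDED_RELU }; // stand-in for ActivationFunction

// Returns the (minval, maxval) pair handed to the a64_add_bn_clamp_* kernels.
std::pair<float, float> clamp_range(Act act, float a, float b)
{
    float minval = std::numeric_limits<float>::lowest();
    float maxval = std::numeric_limits<float>::max();
    if (act == Act::RELU)
    {
        minval = 0.f;
    }
    else if (act == Act::BOUNDED_RELU)
    {
        minval = 0.f;
        maxval = a; // upper bound from act_info.a()
    }
    else if (act == Act::LU_BOUNDED_RELU)
    {
        minval = b; // lower bound from act_info.b()
        maxval = a;
    }
    return {minval, maxval};
}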
maxval = static_cast(act_info.a()); @@ -909,42 +926,37 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast(out_it.ptr()), out_stride, - reinterpret_cast(add_out_it.ptr()), out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast(out_it.ptr()), out_stride, + reinterpret_cast(add_out_it.ptr()), out_direct_stride, + reinterpret_cast(in1_it.ptr()), in0_stride, + reinterpret_cast(in2_it.ptr()), in1_stride, + reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast(out_it.ptr()), out_stride, nullptr, + out_direct_stride, reinterpret_cast(in1_it.ptr()), + in0_stride, reinterpret_cast(in2_it.ptr()), in1_stride, + reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp index b0c487ec56..f0444b6acd 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp @@ -35,16 +35,20 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_fp32_2x16( - float *out, size_t out_stride, - float *out_direct, size_t out_direct_stride, - const float *in0, size_t in0_stride, - const float *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const float minval, - const float maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp32_2x16(float *out, + size_t out_stride, + float *out_direct, + size_t out_direct_stride, + const float *in0, + size_t in0_stride, + const float *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const float minval, + const float maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -631,18 +635,30 @@ void a64_add_bn_clamp_direct_fp32_2x16( "subs x20, x20, #0x2\n" "bgt 8b\n" "34:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] 
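Tying the pieces together, each call into the a64_add_bn_clamp_direct_* kernels above computes an element-wise sum, optionally writes that sum out untouched (the "direct" output, used only when add_output is non-null), then applies the bn_mul/bn_add pair and the clamp range to produce the final output. A scalar reading of that contract, under the assumption that bn_mul and bn_add are indexed along the width (x) dimension, and with element strides rather than the kernel's byte strides:

#include <algorithm>
#include <cstddef>

// Scalar model of the fused add + batch-norm + clamp kernels driven above.
void add_bn_clamp_ref(float *out, size_t out_stride,
                      float *out_direct, size_t out_direct_stride, // may be nullptr
                      const float *in0, size_t in0_stride,
                      const float *in1, size_t in1_stride,
                      const float *bn_mul, const float *bn_add,
                      float minval, float maxval,
                      size_t width, size_t height)
{
    for (size_t y = 0; y < height; ++y)
    {
        for (size_t x = 0; x < width; ++x)
        {
            const float sum = in0[y * in0_stride + x] + in1[y * in1_stride + x];
            if (out_direct != nullptr)
            {
                out_direct[y * out_direct_stride + x] = sum; // intermediate add result
            }
            const float bn = sum * bn_mul[x] + bn_add[x];    // per-channel scale and shift
            out[y * out_stride + x] = std::min(maxval, std::max(minval, bn));
        }
    }
}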
"r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); -} + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } +} // namespace namespace arm_compute { namespace cpu { -void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp32_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -654,16 +670,16 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I float minval = std::numeric_limits::lowest(); float maxval = std::numeric_limits::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = 0.f; } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = 0.f; maxval = act_info.a(); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = act_info.b(); maxval = act_info.a(); @@ -681,42 +697,34 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - reinterpret_cast(add_out_it.ptr()), out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, reinterpret_cast(add_out_it.ptr()), + out_direct_stride, reinterpret_cast(in1_it.ptr()), in0_stride, + reinterpret_cast(in2_it.ptr()), in1_stride, 
reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - reinterpret_cast(bn_mul->buffer()), - reinterpret_cast(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast(in1_it.ptr()), in0_stride, reinterpret_cast(in2_it.ptr()), + in1_stride, reinterpret_cast(bn_mul->buffer()), + reinterpret_cast(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp index f7448a6717..035805c944 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp @@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_u8_fp32_2x16( - uint8_t *out, size_t out_stride, - uint8_t *out_direct, size_t out_direct_stride, - const uint8_t *in0, size_t in0_stride, - const uint8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const uint8_t minval, - const uint8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out, + size_t out_stride, + uint8_t *out_direct, + size_t out_direct_stride, + const uint8_t *in0, + size_t in0_stride, + const uint8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const uint8_t minval, + const uint8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_u8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_u8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe uint8_t maxval = std::numeric_limits::max(); const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? 
add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset;
@@ -783,50 +809,35 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, - reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<const uint8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<const uint8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, + reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride, + reinterpret_cast<const uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<const uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<const uint8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<const uint8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<const uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<const uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp index 1ae2cb76a9..e1a45b467b 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
@@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_s8_fp32_2x16( - int8_t *out, size_t out_stride, - int8_t *out_direct, size_t out_direct_stride, - const int8_t *in0, size_t in0_stride, - const int8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const int8_t minval, - const int8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out, + size_t out_stride, + int8_t *out_direct, + size_t out_direct_stride, + const int8_t *in0, + size_t in0_stride, + const int8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const int8_t
minval, + const int8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_s8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_s8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe int8_t maxval = std::numeric_limits::max(); const UniformQuantizationInfo final_output_qinfo = 
final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset; @@ -783,50 +809,35 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - reinterpret_cast(add_out_it.ptr()), out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast(out_it.ptr()), out_stride, reinterpret_cast(add_out_it.ptr()), + out_direct_stride, reinterpret_cast(in1_it.ptr()), in0_stride, + reinterpret_cast(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, + out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset, + in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast(in1_it.ptr()), in0_stride, - reinterpret_cast(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + 
reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<const int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<const int8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h index a7c22c06d8..568003a916 100644 --- a/src/cpu/kernels/addmuladd/list.h +++ b/src/cpu/kernels/addmuladd/list.h
@@ -32,9 +32,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ +#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \ - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) + ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \ + const ActivationLayerInfo &act_info, const Window &window) DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon); DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon);
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h index 10bf8e4ff7..6e8f32ef47 100644 --- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -26,6 +26,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/NEON/INEKernel.h" #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
@@ -57,13 +58,12 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel public: /** Constructor */ - CpuGemmAssemblyWrapperKernel() - : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") + CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") { } - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; const char *name() const override
@@ -110,7 +110,7 @@ public: INEKernel::configure(win); - if(!kernel_name_tag.empty()) + if (!kernel_name_tag.empty()) { _name += "/" + kernel_name_tag; }
@@ -132,7 +132,7 @@ private: arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel; - std::string _name; + std::string _name; }; } // namespace kernel } // namespace cpu
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp index 4c127b4ec3..9a913c5c58 100644 --- a/src/cpu/kernels/assembly/arm_gemm.hpp +++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -23,13 +23,12 @@ */ #pragma once +#include "arm_gemm_local.hpp" +#include "gemm_common.hpp" #include #include #include -#include "arm_gemm_local.hpp" -#include "gemm_common.hpp" - namespace arm_gemm { enum class GemmMethod
@@ -111,8 +110,7 @@ struct GemmConfig unsigned int outer_block_size = 0; WeightFormat weight_format = WeightFormat::ANY; - GemmConfig(GemmMethod method) - : method(method) + GemmConfig(GemmMethod method) : method(method) { } GemmConfig()
@@ -133,8 +131,7 @@ struct Activation float param1; float param2; - Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) - : type(type), param1(p1),
param2(p2) + Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2) { } }; @@ -156,12 +153,32 @@ public: bool _fast_mode; const GemmConfig *_cfg; - GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, - unsigned int K, unsigned int Ksections, unsigned int nbatches, - unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, - bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), - _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg) + GemmArgs(const CPUInfo *ci, + unsigned int M, + unsigned int N, + unsigned int K, + unsigned int Ksections, + unsigned int nbatches, + unsigned int nmulti, + bool indirect_input, + Activation act, + const int maxthreads, + bool fixed_format = false, + bool fast_mode = false, + const GemmConfig *cfg = nullptr) + : _ci(ci), + _Msize(M), + _Nsize(N), + _Ksize(K), + _Ksections(Ksections), + _nbatches(nbatches), + _nmulti(nmulti), + _indirect_input(indirect_input), + _act(act), + _maxthreads(maxthreads), + _fixed_format(fixed_format), + _fast_mode(fast_mode), + _cfg(cfg) { } }; @@ -187,23 +204,51 @@ public: Requantize32() = default; // Constructor for per-tensor quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, 0)), - per_layer_right_shift(std::min(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, + int32_t requant_shift, + int32_t requant_mul, + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(false), + per_layer_left_shift(std::max(requant_shift, 0)), + per_layer_right_shift(std::min(requant_shift, 0)), + per_layer_mul(requant_mul), + minval(minv), + maxval(maxv) { } // Constructor for per-channel quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, const int32_t *requant_left_shifts, const int32_t *requant_right_shifts, const int32_t *requant_muls, - int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), - per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(true), + per_channel_left_shifts(requant_left_shifts), + per_channel_right_shifts(requant_right_shifts), + per_channel_muls(requant_muls), + minval(minv), + maxval(maxv) { } }; diff --git 
a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp index 718fcd1fb4..0672e899b6 100644 --- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp +++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -27,7 +27,6 @@ #include "arm_compute/core/Window.h" #include "ndrange.hpp" - #include /* This file contains mapping between integral types used in arm_compute and arm_gemm
@@ -38,8 +37,7 @@ namespace arm_gemm { //we want to unify the maximum number of dimensions used between arm_gemm and arm compute library -constexpr std::size_t ndrange_max = - arm_compute::Dimensions<unsigned int>::num_max_dimensions; +constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions; using ndrange_t = NDRange<ndrange_max>; using ndcoord_t = NDCoordinate<ndrange_max>;
@@ -56,7 +54,7 @@ inline arm_compute::Window to_window(const ndrange_t &ndr) { arm_compute::Window win; - for(unsigned int i = 0; i != ndrange_max; ++i) + for (unsigned int i = 0; i != ndrange_max; ++i) { //populate the window with the dimensions of the NDRange win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
@@ -75,7 +73,7 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) { arm_compute::Window win; - for(unsigned int i = 0; i != ndrange_max; ++i) + for (unsigned int i = 0; i != ndrange_max; ++i) { const auto start = ndc.get_position(i); const auto size = ndc.get_size(i);
@@ -98,15 +96,12 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) */ inline ndrange_t to_ndrange(const arm_compute::Window &win) { - return - { - static_cast<unsigned int>(win[0].end() - win[0].start()), - static_cast<unsigned int>(win[1].end() - win[1].start()), - static_cast<unsigned int>(win[2].end() - win[2].start()), - static_cast<unsigned int>(win[3].end() - win[3].start()), - static_cast<unsigned int>(win[4].end() - win[4].start()), - static_cast<unsigned int>(win[5].end() - win[5].start()) - }; + return {static_cast<unsigned int>(win[0].end() - win[0].start()), + static_cast<unsigned int>(win[1].end() - win[1].start()), + static_cast<unsigned int>(win[2].end() - win[2].start()), + static_cast<unsigned int>(win[3].end() - win[3].start()), + static_cast<unsigned int>(win[4].end() - win[4].start()), + static_cast<unsigned int>(win[5].end() - win[5].start())}; } /** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
@@ -116,15 +111,12 @@ inline ndrange_t to_ndrange(const arm_compute::Window &win) */ inline ndcoord_t to_ndcoord(const arm_compute::Window &win) { - return - { - { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) }, - { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) }, - { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) }, - { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) }, - { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) }, - { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) } - }; + return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())}, + {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())}, + {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())}, + {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())}, + {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())}, + {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}}; } } //namespace arm_gemm
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp index 834cd1061e..6fe9f13f02 100644 --- a/src/cpu/kernels/assembly/gemm_common.hpp +++ 
b/src/cpu/kernels/assembly/gemm_common.hpp @@ -25,7 +25,6 @@ #include "convolution_parameters.hpp" #include "ndrange.hpp" - #include namespace arm_gemm @@ -51,10 +50,19 @@ public: * appropriately typed pointers. If B is pretransposed (see below) then * the settings for B here are ignored. */ - virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; + virtual void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) = 0; /** @returns an ndrange containing ranges of the compute space which can be * broken up and parallelised over @@ -73,7 +81,7 @@ public: * This has an empty default implementation, as GEMMs which don't care * about thread count can safely ignore this. */ - virtual void set_nthreads(int) {}; + virtual void set_nthreads(int){}; /* Whether this GEMM can be dynamically scheduled or not. */ virtual bool supports_dynamic_scheduling() const @@ -95,7 +103,7 @@ public: return 0; } /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void set_working_space(void *) {}; + virtual void set_working_space(void *){}; /*** "Pretransposed" interface (optional) ***/ /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ @@ -122,7 +130,8 @@ public: /* The "real" version of this depends on the templated operand type (see below). */ virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; /* Threaded version with window start/end parameters */ - virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; + virtual void + pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ virtual void set_pretransposed_B_data(void *) @@ -186,10 +195,19 @@ protected: public: /* Pass in the pointers to the arrays to be operated on and their * strides (templated version with appropriate types). 
*/ - virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) + virtual void set_arrays(const To *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const To *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + Tr *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const Tr *bias, + /* no row or batch stride needed */ const int bias_multi_stride) { _Aptr = A; _lda = lda; @@ -207,25 +225,33 @@ public: } /* Implementation of the void * overload which casts its arguments to the appropriate type. */ - void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override + void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) override { - set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, - static_cast(B), ldb, B_multi_stride, - static_cast(C), ldc, C_batch_stride, C_multi_stride, + set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, static_cast(B), ldb, + B_multi_stride, static_cast(C), ldc, C_batch_stride, C_multi_stride, static_cast(bias), bias_multi_stride); } /*** "Pretransposed" interface ***/ /* Compute col sums over all columns */ - virtual void requantize_bias(void *, const To *, const int, const int) {}; + virtual void requantize_bias(void *, const To *, const int, const int){}; /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; + virtual void pretranspose_B_array(void *, const To *, const int, const int){}; /* Implementation of the void * overload which casts its arguments to the appropriate type. */ void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override @@ -237,12 +263,14 @@ public: * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only * legal values for start and end are 0 and 1 respectively. 
*/ - virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) + virtual void + pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) { pretranspose_B_array(out, in, row_stride, multi_stride); }; - void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override + void pretranspose_B_array_part_generic( + void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override { pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end); }
diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp index 1c8261aef7..baccdc0d88 100644 --- a/src/cpu/kernels/assembly/ndrange.hpp +++ b/src/cpu/kernels/assembly/ndrange.hpp
@@ -45,8 +45,7 @@ private: unsigned int m_end = 0; public: - NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) - : m_parent(p), m_pos(s), m_end(e) + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
@@ -59,12 +58,12 @@ private: { unsigned int r = m_pos; - if(d < (D - 1)) + if (d < (D - 1)) { r %= m_parent.m_totalsizes[d]; } - if(d > 0) + if (d > 0) { r /= m_parent.m_totalsizes[d - 1]; }
@@ -98,9 +97,9 @@ private: { unsigned int t = 1; - for(unsigned int i = 0; i < D; i++) + for (unsigned int i = 0; i < D; i++) { - if(m_sizes[i] == 0) + if (m_sizes[i] == 0) { m_sizes[i] = 1; }
@@ -116,14 +115,12 @@ public: NDRange(const NDRange &rhs) = default; template <typename... T> - NDRange(T... ts) - : m_sizes{ ts... } + NDRange(T... ts) : m_sizes{ts...} { set_totalsizes(); } - NDRange(const std::array<unsigned int, D> &n) - : m_sizes(n) + NDRange(const std::array<unsigned int, D> &n) : m_sizes(n) { set_totalsizes(); }
@@ -163,7 +160,7 @@ public: std::array sizes{}; std::size_t i = 0; - for(auto &p : list) + for (auto &p : list) { m_positions[i] = p.first; sizes[i++] = p.second;
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp index 5661479059..dbdec5fb50 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
@@ -29,7 +29,11 @@ namespace arm_compute { namespace cpu { -void neon_fp16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_fp16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window); }
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp index 34ff9224d5..0224b3406a 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
@@ -26,7 +26,11 @@ namespace arm_compute { namespace cpu { -void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_fp32_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window); } diff
--git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp index b3ffd0a676..5a2939b587 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp @@ -29,7 +29,11 @@ namespace arm_compute { namespace cpu { -void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void bounding_box_transform_qsymm16(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; @@ -41,7 +45,8 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c const auto scale_before = bbinfo.scale(); const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f); - auto pred_ptr = reinterpret_cast(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); + auto pred_ptr = + reinterpret_cast(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); auto delta_ptr = reinterpret_cast(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); const auto boxes_qinfo = boxes->info()->quantization_info().uniform(); @@ -49,41 +54,49 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform(); Iterator box_it(boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto ptr = reinterpret_cast(box_it.ptr()); - const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); - const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); - const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); - const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); - const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; - const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; - const float ctr_x = (b0 / scale_before) + 0.5f * width; - const float ctr_y = (b1 / scale_before) + 0.5f * height; - for(size_t j = 0; j < num_classes; ++j) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0]; - const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1]; - float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2]; - float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3]; - // Clip dw and dh - dw = std::min(dw, bbinfo.bbox_xform_clip()); - dh = std::min(dh, bbinfo.bbox_xform_clip()); - // Determine the predictions - const float pred_ctr_x = dx * width + ctr_x; - const float pred_ctr_y = dy * height + ctr_y; - const float pred_w = std::exp(dw) * width; - const float pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); - pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); - pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo); - 
pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo); - } - }, - box_it); + const auto ptr = reinterpret_cast(box_it.ptr()); + const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); + const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); + const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); + const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); + const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; + const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; + const float ctr_x = (b0 / scale_before) + 0.5f * width; + const float ctr_y = (b1 / scale_before) + 0.5f * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0]; + const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1]; + float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2]; + float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3]; + // Clip dw and dh + dw = std::min(dw, bbinfo.bbox_xform_clip()); + dh = std::min(dh, bbinfo.bbox_xform_clip()); + // Determine the predictions + const float pred_ctr_x = dx * width + ctr_x; + const float pred_ctr_y = dy * height + ctr_y; + const float pred_w = std::exp(dw) * width; + const float pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); + pred_ptr[delta_id + 1] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); + pred_ptr[delta_id + 2] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), + pred_qinfo); + pred_ptr[delta_id + 3] = quantize_qasymm16( + scale_after * utility::clamp(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), + pred_qinfo); + } + }, + box_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h index 7f990396df..d8013c6227 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h @@ -30,7 +30,11 @@ namespace arm_compute namespace cpu { template -void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void bounding_box_transform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; const size_t deltas_width = deltas->info()->tensor_shape()[0]; @@ -46,44 +50,53 @@ void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITe auto delta_ptr = reinterpret_cast(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); Iterator box_it(boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto ptr = reinterpret_cast(box_it.ptr()); - const auto b0 = *ptr; - const auto b1 = *(ptr + 1); - const auto b2 = *(ptr + 2); - const auto b3 = *(ptr + 3); - const T width = (b2 / scale_before) - (b0 / 
scale_before) + T(1.f); - const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); - const T ctr_x = (b0 / scale_before) + T(0.5f) * width; - const T ctr_y = (b1 / scale_before) + T(0.5f) * height; - for(size_t j = 0; j < num_classes; ++j) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]); - const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]); - T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]); - T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]); - // Clip dw and dh - dw = std::min(dw, T(bbinfo.bbox_xform_clip())); - dh = std::min(dh, T(bbinfo.bbox_xform_clip())); - // Determine the predictions - const T pred_ctr_x = dx * width + ctr_x; - const T pred_ctr_y = dy * height + ctr_y; - const T pred_w = std::exp(dw) * width; - const T pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = scale_after * utility::clamp(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); - pred_ptr[delta_id + 1] = scale_after * utility::clamp(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); - pred_ptr[delta_id + 2] = scale_after * utility::clamp(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); - pred_ptr[delta_id + 3] = scale_after * utility::clamp(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); - } - }, - box_it); + const auto ptr = reinterpret_cast(box_it.ptr()); + const auto b0 = *ptr; + const auto b1 = *(ptr + 1); + const auto b2 = *(ptr + 2); + const auto b3 = *(ptr + 3); + const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f); + const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); + const T ctr_x = (b0 / scale_before) + T(0.5f) * width; + const T ctr_y = (b1 / scale_before) + T(0.5f) * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]); + const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]); + T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]); + T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]); + // Clip dw and dh + dw = std::min(dw, T(bbinfo.bbox_xform_clip())); + dh = std::min(dh, T(bbinfo.bbox_xform_clip())); + // Determine the predictions + const T pred_ctr_x = dx * width + ctr_x; + const T pred_ctr_y = dy * height + ctr_y; + const T pred_w = std::exp(dw) * width; + const T pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = scale_after * utility::clamp(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); + pred_ptr[delta_id + 1] = + scale_after * utility::clamp(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); + pred_ptr[delta_id + 2] = + scale_after * utility::clamp(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); + pred_ptr[delta_id + 3] = + scale_after * utility::clamp(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); + } + }, + box_it); } -void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window); +void bounding_box_transform_qsymm16(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H diff --git 
a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp index b27c187df3..64ef815195 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp @@ -26,7 +26,11 @@ namespace arm_compute { namespace cpu { -void neon_qu16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_qu16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window); } diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h index 8f06acc8a6..4da725a257 100644 --- a/src/cpu/kernels/boundingboxtransform/list.h +++ b/src/cpu/kernels/boundingboxtransform/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \ - void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \ + void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \ + const Window &window) DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform); DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform); DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform); diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp index 6cd0c8500b..2897f4b242 100644 --- a/src/cpu/kernels/cast/generic/neon/fp16.cpp +++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" -#include "src/cpu/kernels/CpuCastKernel.h" + #include "src/cpu/kernels/cast/list.h" +#include "src/cpu/kernels/CpuCastKernel.h" #include "support/SaturateCast.h" #include "arm_neon.h" @@ -35,7 +36,8 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_qasymm8_signed_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -49,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - 
}; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } -void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_s32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -98,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vcvtq_f32_s32(vld1q_s32(src_ptr + x)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = { + {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -149,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - 
execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp16_to_other_dt_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -200,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); - } + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), + vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case 
DataType::U8: { /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), + vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion F16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); - vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); + vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion F16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - 
vld1q_f16(src_ptr + x + 8) - } - }; - - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -343,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread } } -void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_u8_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -357,40 +341,37 @@ void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo & ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /* Up-conversion U8 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); return; } diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h index ffd82d5bf3..5e634fc170 100644 --- a/src/cpu/kernels/cast/list.h +++ b/src/cpu/kernels/cast/list.h @@ -27,8 +27,9 @@ namespace arm_compute 
{ namespace cpu { -#define DECLARE_CAST_KERNEL(func_name) \ - void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window) +#define DECLARE_CAST_KERNEL(func_name) \ + void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, \ + const Window &window) DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast); DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast); @@ -41,4 +42,4 @@ DECLARE_CAST_KERNEL(neon_bfloat16_to_fp32_cast); #undef DECLARE_CAST_KERNEL } // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H \ No newline at end of file +#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h index 3bfa124dc3..082c60be29 100644 --- a/src/cpu/kernels/conv3d/neon/list.h +++ b/src/cpu/kernels/conv3d/neon/list.h @@ -27,8 +27,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/runtime/FunctionDescriptors.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/conv3d/neon/quantized.h" namespace arm_compute @@ -36,7 +37,12 @@ namespace arm_compute namespace cpu { template -void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window) +void directconv3d_float_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) { const ITensor *src = src0; const ITensor *weights = src1; @@ -88,91 +94,104 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, con Iterator wei(weights, window_w); const T *biases_ptr = nullptr; - if(biases != nullptr) + if (biases != nullptr) { biases_ptr = reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - const int in_d_end_t = in_d_start_t + kernel_dim_d; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_d_start = std::max(in_d_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - const int in_d_end = std::min(in_d_end_t, input_dim_d); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_d_start = in_d_start - in_d_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); - - const int index_c_out_end = weights->info()->dimension(0); - const int index_c_in_end = weights->info()->dimension(1); - const T *const 
in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n; - - execute_window_loop(window_w, [&](const Coordinates & id_w) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - /* + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = in_d_start_t + kernel_dim_d; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along OFM (output feature map) */ - const auto weights_ptr_start = reinterpret_cast(wei.ptr()); - T out_temp = static_cast(0); - T *out_ptr = reinterpret_cast(out.ptr()); - for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d) - { - const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; - const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + const auto weights_ptr_start = reinterpret_cast(wei.ptr()); + T out_temp = static_cast(0); + T *out_ptr = reinterpret_cast(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c_in = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration; - index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + const auto in_ptr_d = in_ptr_start + 
index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - //Load Cin weights - for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end) + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) { - w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } } - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; } } - } - } - *(reinterpret_cast(out_ptr + id_w[0])) = (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp; + *(reinterpret_cast(out_ptr + id_w[0])) = + (biases_ptr != nullptr) ? 
out_temp + biases_ptr[id_w[0]] : out_temp; + }, + wei); }, - wei); - }, - out); + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h index a8165b4944..f0fc9b5a71 100644 --- a/src/cpu/kernels/conv3d/neon/quantized.h +++ b/src/cpu/kernels/conv3d/neon/quantized.h @@ -28,16 +28,22 @@ #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { namespace cpu { template -void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window) +void directconv3d_quantized_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) { const ITensor *src = src0; const ITensor *weights = src1; @@ -104,153 +110,166 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, Iterator wei(weights, window_w); const int32_t *biases_ptr = nullptr; - if(biases != nullptr) + if (biases != nullptr) { biases_ptr = reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - const int in_d_end_t = in_d_start_t + kernel_dim_d; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast(id[3]) * conv_stride_d - conv_pad_front; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = in_d_start_t + kernel_dim_d; - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_d_start = std::max(in_d_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - const int in_d_end = std::min(in_d_end_t, input_dim_d); + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); - // We use the input points to select the 
valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_d_start = in_d_start - in_d_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); - const int index_c_out_end = weights->info()->dimension(0); - const int index_c_in_end = weights->info()->dimension(1); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n; + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - /* + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along OFM (output feature map) */ - const auto weights_ptr_start = reinterpret_cast(wei.ptr()); - int32_t acc = static_cast(0); - T *out_ptr = reinterpret_cast(out.ptr()); - for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d) - { - const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; - const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + const auto weights_ptr_start = reinterpret_cast(wei.ptr()); + int32_t acc = static_cast(0); + T *out_ptr = reinterpret_cast(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c_in = 0; - vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); - - q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast(0), tag_type()); - - for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration; - index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto 
src_vec = wrapper::vloadq(in_ptr_mover); - //Load Cin weights - for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end) + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) { - w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); - } - q32x4_t src_q32_0 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - q32x4_t src_q32_1 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - q32x4_t src_q32_2 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - q32x4_t src_q32_3 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type w_vec = wrapper::vdup_n(static_cast(0), tag_type()); - q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast(0), tag_type()); + q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast(0), tag_type()); + q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast(0), tag_type()); + q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast(0), tag_type()); - const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec)); - const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec)); - const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec)); - const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec)); + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + q32x4_t src_q32_0 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + q32x4_t src_q32_1 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + q32x4_t src_q32_2 = wrapper::vdup_n(static_cast(input_offset), tag_type()); + q32x4_t src_q32_3 = wrapper::vdup_n(static_cast(input_offset), tag_type()); - src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0))); - src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0))); - src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1))); - src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1))); + q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); + q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast(weights_offset), tag_type()); - wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0))); - wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0))); - wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1))); - wei_q32_3 = 
wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1))); + const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec)); + const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec)); + const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec)); + const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec)); - acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0); - acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1); - acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2); - acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3); - } + src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0))); + src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0))); + src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1))); + src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1))); + + wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0))); + wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0))); + wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1))); + wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1))); + + acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0); + acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1); + acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2); + acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3); + } #if defined(__aarch64__) - acc += wrapper::vaddv(acc_q32_0); - acc += wrapper::vaddv(acc_q32_1); - acc += wrapper::vaddv(acc_q32_2); - acc += wrapper::vaddv(acc_q32_3); + acc += wrapper::vaddv(acc_q32_0); + acc += wrapper::vaddv(acc_q32_1); + acc += wrapper::vaddv(acc_q32_2); + acc += wrapper::vaddv(acc_q32_3); #else // __aarch64__ - auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); #endif // __aarch64__ - for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) - { - const auto src_val = *(in_ptr_mover) + input_offset; - const auto w_val = *(weights_ptr_mover) + weights_offset; - acc += src_val * w_val; + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = 
*(in_ptr_mover) + input_offset; + const auto w_val = *(weights_ptr_mover) + weights_offset; + acc += src_val * w_val; + } + } } } - } - } - if(biases) - { - acc += *reinterpret_cast(biases_ptr + id_w[0]); - } + if (biases) + { + acc += *reinterpret_cast(biases_ptr + id_w[0]); + } - T out_val = finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false); - *(reinterpret_cast(out_ptr + id_w[0])) = out_val; + T out_val = + finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false); + *(reinterpret_cast(out_ptr + id_w[0])) = out_val; + }, + wei); }, - wei); - }, - out); + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h index 1fe8e11e98..8fb7ad2087 100644 --- a/src/cpu/kernels/crop/generic/neon/crop_helper.h +++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h @@ -80,7 +80,7 @@ inline float32x4_t load_as_f32(uint8_t *ptr) { return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr))))); } -} +} // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H \ No newline at end of file +#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp index 218ebba191..3739c9d4e0 100644 --- a/src/cpu/kernels/crop/generic/neon/fp16.cpp +++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp @@ -29,12 +29,19 @@ namespace arm_compute { namespace cpu { -void fp16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void fp16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp index 16d0218fce..f665c3652c 100644 --- a/src/cpu/kernels/crop/generic/neon/fp32.cpp +++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp @@ -28,11 +28,18 @@ namespace arm_compute { namespace cpu { -void fp32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void fp32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { 
- return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h index a59588be45..b90ba9ddbf 100644 --- a/src/cpu/kernels/crop/generic/neon/impl.h +++ b/src/cpu/kernels/crop/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/crop/generic/neon/crop_helper.h" namespace arm_compute @@ -35,19 +36,26 @@ namespace arm_compute namespace cpu { template -void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { // Reverse elements if width flipped. - if(is_width_flipped) + if (is_width_flipped) { // Collapse first dimension if possible. - if(input_has_single_channel) + if (input_has_single_channel) { int32_t x = output_width_start; Coordinates negative_offset(input_offset); negative_offset.set(1, negative_offset[1] - window_step_x + 1); - for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) + for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) { auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(negative_offset))); @@ -57,25 +65,27 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o wrapper::vstore(output_ptr + x, in); } input_offset[1] = negative_offset[1] + window_step_x - 1; - for(; x < output_width_limit; ++x, --input_offset[1]) + for (; x < output_width_limit; ++x, --input_offset[1]) { *(output_ptr + x) = static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); } } else { - for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) + for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) { input_offset.set(0, 0); int32_t c = 0; - for(; c <= static_cast(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x) + for (; c <= static_cast(input->info()->dimension(0)) - window_step_x; + c += window_step_x, input_offset[0] += window_step_x) { auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(input_offset))); wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in); } - for(; c < static_cast(input->info()->dimension(0)); ++c, ++input_offset[0]) + for (; c < static_cast(input->info()->dimension(0)); ++c, ++input_offset[0]) { - *(output_ptr + x * output->info()->dimension(0) + c) = static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); + *(output_ptr + x * output->info()->dimension(0) + c) = + 
static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); } } } @@ -83,25 +93,28 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o else { // Use memcpy if the elements don't need converting to float. - if(std::is_same::value) + if (std::is_same::value) { memcpy(static_cast(output_ptr + output_width_start * output->info()->dimension(0)), reinterpret_cast(input->ptr_to_element(input_offset)), - (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size()); + (output_width_limit - output_width_start) * output->info()->dimension(0) * + output->info()->element_size()); } else { - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); + int32_t x = 0; + int32_t limit = + (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) + for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) { auto in = load_as_f32(reinterpret_cast(input->ptr_to_element(input_offset))); wrapper::vstore(output_start_ptr + x, in); } - for(; x < limit; ++x, ++input_offset[0]) + for (; x < limit; ++x, ++input_offset[0]) { - *(output_start_ptr + x) = static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); + *(output_start_ptr + x) = + static_cast(*reinterpret_cast(input->ptr_to_element(input_offset))); } } } diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp index ebf2c1fbd3..602434f54f 100644 --- a/src/cpu/kernels/crop/generic/neon/integer.cpp +++ b/src/cpu/kernels/crop/generic/neon/integer.cpp @@ -29,46 +29,88 @@ namespace arm_compute { namespace cpu { -void u8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void u16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, 
input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void u32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window(input, output, output_ptr, input_offset, window_step_x, 
output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h index a6b83215ae..9cb7726203 100644 --- a/src/cpu/kernels/crop/list.h +++ b/src/cpu/kernels/crop/list.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/crop/generic/neon/impl.h" namespace arm_compute @@ -36,7 +37,8 @@ namespace cpu { #define DECLARE_CROP_KERNEL(func_name) \ void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \ - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) + int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, \ + bool input_has_single_channel, bool is_width_flipped) DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window); DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window); diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp index e85a1664ea..293e606d81 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp @@ -29,11 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_fp16_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_fp16_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_float(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp index b2333a3334..c6fa4790b7 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp @@ -26,10 +26,15 @@ namespace arm_compute { namespace cpu { -void neon_fp32_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_fp32_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_float(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp index a2ae5564e6..d08e973968 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/function_info/ConvolutionInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -65,8 +67,16 @@ inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent) namespace { template -void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_multiplier1_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + std::vector output_multiplier, + std::vector output_shift, + const Window &window, + bool has_biases) // NOLINT { ARM_COMPUTE_UNUSED(output_multiplier, output_shift); constexpr auto element_per_vector = vector_size / sizeof(T); @@ -75,7 +85,8 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei using AccType = int32_t; using AccArrayType = std::array; - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); + const auto out_of_bound_value = + PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); @@ -104,152 +115,175 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto const base_weights_ptr = weights_it.ptr(); - size_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) + execute_window_loop( + execution_window, + [&](const Coordinates &id) { - AccArrayType acc{}; - AccArrayType in_sum{}; - AccArrayType we_sum{}; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + size_t x = run_info.x_start; - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) + for (; x < run_info.x_leftover_start; x += run_info.x_step) { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? 
- wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - out_of_bound_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; - for(size_t i = 0; i < element_per_vector; ++i) + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast( + input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) + : out_of_bound_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; - VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; + if (has_biases) + { + acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); + } - if(has_biases) - { - acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if (out_shift < 0) + { + acc.at(i) = + saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + + output_qoffset; + } + out_vals[i] = static_cast(utility::clamp(acc.at(i))); } - const int32_t out_mul = output_multiplier.at(x + i); - const int32_t out_shift = output_shift.at(x + i); - if(out_shift < 0) - { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; - } - out_vals[i] = static_cast(utility::clamp(acc.at(i))); + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); } - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); - } - - // left-over - for(; x < run_info.x_end; ++x) - { - AccType acc = 0; - AccType in_sum = 0; - AccType we_sum = 0; + // left-over + for (; x < run_info.x_end; ++x) + { + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; - auto weights_ptr = 
base_weights_ptr; - auto input_offset = base_input_offset; + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? - *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : - out_of_bound_value; - const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc += input_val * weights_val; - in_sum += input_val; - we_sum += weights_val; + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region + ? *reinterpret_cast(input_it.ptr() + + std::min(static_cast(offs), run_info.input_max_offset)) + : out_of_bound_value; + const auto weights_val = + *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + T out_vals{0}; - T out_vals{ 0 }; + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; - acc -= in_sum * weights_qoffset; - acc -= we_sum * input_qoffset; - acc += k_offset; + if (has_biases) + { + acc += *(reinterpret_cast(biases_it.ptr()) + x); + } - if(has_biases) - { - acc += *(reinterpret_cast(biases_it.ptr()) + x); - } + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); - const int32_t out_mul = output_multiplier.at(x); - const int32_t out_shift = output_shift.at(x); + if (out_shift < 0) + { + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + } - if(out_shift < 0) - { - acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + out_vals = static_cast(utility::clamp(acc)); + *(reinterpret_cast(output_it.ptr()) + x) = out_vals; } - - out_vals = static_cast(utility::clamp(acc)); - *(reinterpret_cast(output_it.ptr()) + x) = out_vals; - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template -void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_generic_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int 
depth_multiplier, + std::vector output_multiplier, + std::vector output_shift, + const Window &window, + bool has_biases) // NOLINT { using AccType = int32_t; - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); + const auto out_of_bound_value = + PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; @@ -277,76 +311,93 @@ void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, 0); - std::vector we_sum(depth_multiplier, 0); - AccType in_sum = 0; + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector acc(depth_multiplier, 0); + std::vector we_sum(depth_multiplier, 0); + AccType in_sum = 0; - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; - - for(size_t m = 0; m < depth_multiplier; ++m) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) += input_val * weights_val; - - we_sum.at(m) += weights_val; - } + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), + run_info.input_max_offset))) + : out_of_bound_value; - offs += dilation.x() * run_info.input_stride_y; - in_sum += input_val; - } + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) += input_val * weights_val; - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + we_sum.at(m) += weights_val; + } - for(size_t m = 0; m < depth_multiplier; ++m) - { - acc.at(m) -= in_sum * weights_qoffset; - acc.at(m) -= we_sum.at(m) * input_qoffset; - acc.at(m) += k_offset; + offs += dilation.x() * run_info.input_stride_y; + in_sum += input_val; + } - if(has_biases) - { - acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); - const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); - if(out_shift < 0) - { - acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else + for (size_t m = 0; m < depth_multiplier; ++m) { - acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; + acc.at(m) -= in_sum * weights_qoffset; + acc.at(m) -= we_sum.at(m) * input_qoffset; + acc.at(m) += k_offset; + + if (has_biases) + { + acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + } + + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); + if (out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + + output_qoffset; + } + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = + static_cast(utility::clamp(acc.at(m))); } - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template -void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + std::vector output_multiplier, + std::vector output_shift, + const Window &window, + bool has_biases) // NOLINT { constexpr int half_vec = vector_size / 2; @@ -355,11 +406,15 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; using TagType = typename wrapper::traits::neon_vector::tag_type; - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), 
conv_info, window, depth_multiplier); - const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(src->info()->quantization_info().uniform().offset), TagType{}))); - const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); - const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); + const auto input_qoffset_vec = wrapper::vreinterpret( + wrapper::vmovl(wrapper::vdup_n(static_cast(src->info()->quantization_info().uniform().offset), TagType{}))); + const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl( + wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); + const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, + arm_compute::wrapper::traits::vector_128_tag{}); const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); @@ -389,7 +444,7 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } @@ -397,95 +452,117 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor std::vector acc0(depth_multiplier / vector_size); std::vector acc1(depth_multiplier / vector_size); - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::fill(begin(acc0), end(acc0), zero); - std::fill(begin(acc1), end(acc1), zero); + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::fill(begin(acc0), end(acc0), zero); + std::fill(begin(acc1), end(acc1), zero); - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - const int32_t current_h = input_z + h * dilation.y(); - if(current_h >= 0 && current_h < static_cast(run_info.input_height)) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + const int32_t current_h = input_z + h * dilation.y(); + if (current_h >= 0 && current_h < static_cast(run_info.input_height)) { - const int32_t current_w = input_y + w * dilation.x(); - if(current_w >= 0 && current_w < static_cast(run_info.input_width)) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); - const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); - const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); - - for(size_t 
m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + const int32_t current_w = input_y + w * dilation.x(); + if (current_w >= 0 && current_w < static_cast(run_info.input_width)) { - const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); - const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); - - acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); - acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); + const auto input_8x8 = wrapper::vdup_n( + *(reinterpret_cast( + input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), + TagType{}); + const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); + const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); + + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + const auto weights_8x8 = wrapper::vload(reinterpret_cast( + weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); + const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); + + acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), + wrapper::vgetlow(weights_no_offs)); + acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), + wrapper::vgethigh(weights_no_offs)); + } } - } - offs += dilation.x() * run_info.input_stride_y; + offs += dilation.x() * run_info.input_stride_y; + } } - } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - if(has_biases) + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) { - const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); + if (has_biases) + { + const auto bias_val0 = + wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + const auto bias_val1 = wrapper::vloadq( + reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); - acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); - acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); - } + acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); + acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); + } - if(out_shift < 0) - { - acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - } - else - { - acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); - } + if (out_shift < 0) + { + acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), + output_qoffset_vec); + acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) 
* (1 << (-out_shift)), out_mul), + output_qoffset_vec); + } + else + { + acc0.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), + output_qoffset_vec); + acc1.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), + output_qoffset_vec); + } - acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); - acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); + acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); + acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); - const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), - wrapper::vmovn(acc1.at(i))); + const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i))); - if(std::is_same::value) - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); - } - else - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); + if (std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), + wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); + } + else + { + wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), + wrapper::vqmovn(out_val)); + } } - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } } // namespace template -void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { PadStrideInfo conv_info = info.pad_stride_info; unsigned int depth_multiplier = info.depth_multiplier; @@ -497,15 +574,15 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co const auto output_scale = dst->info()->quantization_info().uniform().scale; auto weights_scale = weights->info()->quantization_info().scale(); - if(!is_data_type_quantized_per_channel(weights->info()->data_type())) + if (!is_data_type_quantized_per_channel(weights->info()->data_type())) { - for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i) + for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i) { weights_scale.push_back(weights_scale.front()); } } - for(const auto &s : weights_scale) + for (const auto &s : weights_scale) { int32_t out_mult = 0; int32_t out_shift = 0; @@ -516,30 +593,49 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co output_shift.push_back(out_shift); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - depthwise_loop_multiplier1_quantized(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases); + depthwise_loop_multiplier1_quantized(src, weights, biases, dst, conv_info, dilation, output_multiplier, + output_shift, window, has_biases); } else { const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0); const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); - if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8) + if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8) { - 
depthwise_loop_pow2_quantized_per_tensor(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases); + depthwise_loop_pow2_quantized_per_tensor(src, weights, biases, dst, conv_info, dilation, + depth_multiplier, output_multiplier, output_shift, window, + has_biases); } else { - depthwise_loop_generic_quantized(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases); + depthwise_loop_generic_quantized(src, weights, biases, dst, conv_info, dilation, depth_multiplier, + output_multiplier, output_shift, window, has_biases); } } } -template void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -template void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -template void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h index 8410cdbf16..3fa5c58c3c 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h @@ -24,6 +24,7 @@ #ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -63,15 +64,21 @@ struct DepthwiseConvolutionRunInfo const size_t input_width; const size_t input_depth; - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT - : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)), + DepthwiseConvolutionRunInfo(const ITensorInfo &input, + const ITensorInfo &weights, + const PadStrideInfo &conv_info, + const Window &w, + uint32_t depth_multiplier = 1) // NOLINT + : num_read_elements_per_iteration( + (depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), x_start(w.x().start()), x_end(w.x().end()), x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), x_leftover_start(std::max(static_cast(w.x().end() + 1) - static_cast(x_step), int32_t(0))), input_stride_y(input.strides_in_bytes().y()), input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - + (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), weights_width(weights.dimension(width_idx)), weights_height(weights.dimension(height_idx)), weights_stride_y(weights.strides_in_bytes().y()), @@ -87,7 +94,12 @@ struct DepthwiseConvolutionRunInfo } }; -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) +inline bool is_valid_input_region(int32_t base_w, + uint32_t base_h, + uint32_t w, + uint32_t h, + const DepthwiseConvolutionRunInfo &run_info, + const Size2D &dilation) { const int32_t current_h = base_h + h * dilation.y(); const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); @@ -99,8 +111,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u } template -void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) +void depthwise_loop_multiplier1_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const Window &window, + bool has_biases) { constexpr auto element_per_vector = vector_size / sizeof(T); using VectorType = typename wrapper::traits::neon_vector::type; @@ -129,94 +147,112 @@ void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, c Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) + execute_window_loop( + execution_window, + [&](const Coordinates &id) { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; - for(uint32_t h = 0; h < run_info.weights_height; ++h) + for (; x < run_info.x_leftover_start; x += run_info.x_step) { - int64_t offs = input_offset + x * sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; 
++w) + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for (uint32_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); + int64_t offs = input_offset + x * sizeof(T); + for (uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast( + input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) + : zero_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - offs += dilation.x() * run_info.input_stride_y; + if (has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); } - if(has_biases) + for (; x < run_info.x_end; ++x) { - const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); - } + auto acc_scalar = T{0}; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? 
*reinterpret_cast(input_it.ptr() + + std::min(static_cast(offs), run_info.input_max_offset)) + : 0; + const auto weights_vals = + *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); - acc_scalar += biases_vals; + if (has_biases) + { + const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; } - *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template -void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) +void depthwise_loop_generic_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + const Window &window, + bool has_biases) { - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); Window execution_window = window; execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); @@ -240,81 +276,98 @@ void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, static_cast(0)); + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector acc(depth_multiplier, static_cast(0)); - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), + run_info.input_max_offset))) + : T(0); + + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) + if (has_biases) { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) + else { - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); + for (size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } } - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template -void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void run_depthwise_float(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { PadStrideInfo conv_info = info.pad_stride_info; unsigned int depth_multiplier = info.depth_multiplier; Size2D dilation = info.dilation; - if(depth_multiplier == 1) + if (depth_multiplier == 1) { depthwise_loop_multiplier1_fp(src, weights, biases, dst, conv_info, dilation, window, has_biases); } else { - depthwise_loop_generic_fp(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases); + depthwise_loop_generic_fp(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, + has_biases); } } template -void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp index 
1bf7ad7007..d32847c1e8 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp @@ -26,16 +26,26 @@ namespace arm_compute { namespace cpu { -void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp index 58f7536064..682fad0bda 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp @@ -26,16 +26,26 @@ namespace arm_compute { namespace cpu { -void neon_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h index 44f055d6a9..cf80608f4f 100644 --- a/src/cpu/kernels/depthwiseconv2d/list.h +++ b/src/cpu/kernels/depthwiseconv2d/list.h @@ -27,9 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \ - void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, \ - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \ + void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \ + const Window &window, bool has_biases, const ConvolutionInfo &info) DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative); DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative); DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative); diff --git 
a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h index 9a0472643d..5cbf7a36c6 100644 --- a/src/cpu/kernels/directconv2d/list.h +++ b/src/cpu/kernels/directconv2d/list.h @@ -32,8 +32,9 @@ namespace cpu { namespace kernels { -#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \ - void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \ + const PadStrideInfo &conv_info) DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d); DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d); diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp index a719fa50d6..218a4b7ee4 100644 --- a/src/cpu/kernels/directconv2d/nchw/all.cpp +++ b/src/cpu/kernels/directconv2d/nchw/all.cpp @@ -22,18 +22,17 @@ * SOFTWARE. */ -#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" - -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" #include @@ -44,22 +43,26 @@ namespace cpu namespace kernels { template -void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void neon_fp16_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { convolve_nchw(window, src, weights, dst, conv_info); } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void neon_fp32_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { convolve_nchw(window, src, weights, dst, conv_info); } template -void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(conv_info); @@ -107,72 +110,81 @@ void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weig constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.x()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.y()) * conv_stride_h - 
conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(2); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - execute_window_loop(window_w, [&](const Coordinates & id_w) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - T out_temp = static_cast(0); - - for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) - { - const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; - const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.x()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.y()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(2); + const T *const in_ptr_start = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + execute_window_loop( + window_w, + [&](const Coordinates &id_w) { - const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; - const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; - int index_w = in_w_start; - int index_wei_w = wei_w_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_w < in_w_end; ++index_w, ++index_wei_w) + const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); + uint8_t *out_ptr = out.ptr() 
+ id_w[3] * output_stride_c; + T out_temp = static_cast(0); + + for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) { - const auto src_val = *(in_ptr_row + index_w * input_stride_w); - const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp += src_val * w_val; + const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; + const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; + const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; + int index_w = in_w_start; + int index_wei_w = wei_w_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for (; index_w <= ((in_w_end - num_elems_read_per_iteration)); + index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_w < in_w_end; ++index_w, ++index_wei_w) + { + const auto src_val = *(in_ptr_row + index_w * input_stride_w); + const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp += src_val * w_val; + } + } } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); }, - wei); - }, - out); + out); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +template void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -template void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +template void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp index 9982431de5..36a8e76f13 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp @@ -30,10 +30,11 @@ namespace cpu { namespace kernels { -void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void neon_fp32_nhwc_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { convolve_nhwc(window, src, weights, dst, conv_info); } } // namespace kernels } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp index 500ad1b420..f235167e28 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp @@ -24,16 +24,16 @@ #include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" -#include 
"src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -49,12 +49,14 @@ namespace { bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) { - return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); -} + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && + weights->padding().right == 0); } +} // namespace template -void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { // Declare useful types using vtype = wrapper::traits::neon_bitvector; @@ -97,7 +99,7 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig constexpr int num_elems_read_per_iteration = 16 / sizeof(T); // nhwc optimized - if(have_zero_x_internal_padding(src->info(), weights->info())) + if (have_zero_x_internal_padding(src->info(), weights->info())) { // This function assumes that input and weights have not padding in channel @@ -114,138 +116,154 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig * multiplication works on the correct input/weight elements. */ execute_window_loop( - window_out, [&](const Coordinates & id) - { - /* + window_out, + [&](const Coordinates &id) + { + /* * In here we create theoretical indexes which then we validate for both * inputs and weights. * As a reminder, this loop take each output point in NHW, C is treated * in the weights loop. 
*/ - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; - const int index_h_start = in_h_start - in_h_start_t; - const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; - const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - execute_window_loop( - window_w, [&](const Coordinates & id_w) - { - /* + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; + const int index_h_start = in_h_start - in_h_start_t; + const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; + const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along N (the batches) * As a reminder, the batches of the weights are translated into the * channels of the output */ - const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) - + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; - const T *weights_ptr_row = reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) - { - const T *in_ptr_mover = in_ptr_row; - int index_wc = index_wc_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_row + index_wc); - out_temp += src_val * w_val; - 
} - } - *(reinterpret_cast(out_ptr)) = out_temp; + const T *in_ptr_row = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; + const T *weights_ptr_row = + reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for (int index_h = index_h_start; index_h < index_h_end; + ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) + { + const T *in_ptr_mover = in_ptr_row; + int index_wc = index_wc_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for (; index_wc <= index_wc_end - num_elems_read_per_iteration; + index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_row + index_wc); + out_temp += src_val * w_val; + } + } + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); }, - wei); - }, - out); + out); } else // nhwc non optimized { execute_window_loop( - window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - - execute_window_loop( - window_w, [&](const Coordinates & id_w) + window_out, + [&](const Coordinates &id) { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = 
in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = + reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_mover); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast(0); + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; + const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); + for (; index_c <= index_c_end - num_elems_read_per_iteration; + index_c += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration, + weights_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_mover); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; + *(reinterpret_cast(out_ptr)) = out_temp; + }, + wei); }, - wei); - }, - out); + out); } } -template void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +template 
void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h index 3b26fcdf29..efb9ce8e2a 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h @@ -26,6 +26,7 @@ #define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H #include "arm_compute/core/ITensor.h" + #include "src/core/helpers/WindowHelpers.h" namespace arm_compute @@ -35,7 +36,8 @@ namespace cpu namespace kernels { template -void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp index 6091ef215e..9b4375f17c 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -35,14 +36,38 @@ void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
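// Illustrative aside (not part of this patch): the fp16 translation unit above is compiled
// only when the compiler exposes half-precision vector arithmetic, in addition to the
// library's own ENABLE_FP16_KERNELS switch. A standalone helper guarded the same way might
// look like this (assumed name):
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#include <arm_neon.h>

inline float16x8_t add_f16(float16x8_t a, float16x8_t b)
{
    return vaddq_f16(a, b); // eight half-precision lanes per instruction
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC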
neon_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -50,12 +75,30 @@ void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor * return elementwise_comp_op_16(in1, in2, out, window); } -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp index 2d8fec91c5..53ccd89dcc 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -34,25 +35,67 @@ void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_32(in1, in2, out, window); } -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
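// Illustrative aside (not part of this patch): the long lists above are explicit template
// instantiations. The generic kernel lives in impl.h, and each .cpp pins down one
// (operation, element type) pair so the object file carries ready-to-link symbols.
// Reduced sketch of the pattern with assumed names:
#include <cstddef>

template <int Op, typename T>
void elementwise_kernel(const T *a, const T *b, T *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        dst[i] = (Op == 0) ? a[i] + b[i] : a[i] - b[i]; // Op selected at compile time
    }
}

// One line per supported combination, mirroring the instantiation lists in this diff.
template void elementwise_kernel<0, float>(const float *, const float *, float *, std::size_t);
template void elementwise_kernel<1, float>(const float *, const float *, float *, std::size_t);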
neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h index 98b154e8fd..98f7e8b949 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -39,7 +39,7 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type vec_type res = wrapper::vdup_n(static_cast(0), tag_type{}); - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = wrapper::vmax(a, b); @@ -71,7 +71,9 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type } template -typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) +typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, + const ScalarType &broadcast_value, + const bool reorder) { using tag_type = typename VectorType::tag_type; using vec_type = typename VectorType::type; @@ -81,10 +83,15 @@ typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorT } template -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) +void elementwise_op( + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)( + int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -99,7 +106,7 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
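// Illustrative aside (not part of this patch): elementwise_op above is a single driver
// parameterised by three function pointers - a scalar fallback, a vectorised loop for the
// broadcast case and a vectorised loop for the general case. Skeleton of the same idea
// with assumed names and simplified signatures:
#include <cstddef>

template <typename T>
struct ElementwiseFns
{
    T (*scalar)(T, T);
    std::size_t (*vector_loop)(const T *, const T *, T *, std::size_t);  // returns first index left for the tail
    std::size_t (*broadcast_loop)(const T *, T, T *, std::size_t, bool); // bool flags reordered operands
};

template <typename T>
void run_elementwise(const T *a, const T *b, T *dst, std::size_t n, const ElementwiseFns<T> &fns)
{
    // The real driver picks fns.broadcast_loop when one input has extent 1 along X;
    // only the general path is shown here.
    std::size_t x = fns.vector_loop(a, b, dst, n);
    for (; x < n; ++x)
    {
        dst[x] = fns.scalar(a[x], b[x]); // scalar tail
    }
}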
input2_win : input1_win; @@ -114,20 +121,26 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_value, output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -139,21 +152,23 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); } } @@ -162,7 +177,7 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar { auto res = ScalarType(0); - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = std::max(a, b); @@ -183,10 +198,10 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar case ArithmeticOperation::DIV: { res = a / b; - if(std::is_integral::value) + if (std::is_integral::value) { res = (b == 0) ? 
0 : res; - if(static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) + if (static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) { --res; } @@ -205,43 +220,56 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar } template <> -inline int32x4_t elementwise_arithm_op>(const int32x4_t &a, const int32x4_t &b) +inline int32x4_t +elementwise_arithm_op>(const int32x4_t &a, + const int32x4_t &b) { return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); } template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) +inline float32x4_t +elementwise_arithm_op>(const float32x4_t &a, + const float32x4_t &b) { return wrapper::vdiv(a, b); } template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) +inline float32x4_t +elementwise_arithm_op>(const float32x4_t &a, + const float32x4_t &b) { return wrapper::vpow(a, b); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) +inline float16x8_t elementwise_arithm_op>( + const float16x8_t &a, const float16x8_t &b) { return wrapper::vdiv(a, b); } template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) +inline float16x8_t +elementwise_arithm_op>( + const float16x8_t &a, const float16x8_t &b) { return wrapper::vpow(a, b); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) +inline int elementwise_arithm_op_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, + ScalarType *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -251,14 +279,20 @@ inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int } template -inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) +inline int elementwise_arithm_op_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); + wrapper::vstore(output_ptr + x, + elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); } return x; } @@ -268,10 +302,10 @@ void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, { using scalar_type = typename VectorType::scalar_type; - elementwise_op(in1, in2, out, window, - &elementwise_arithm_op_scalar, - &elementwise_arithm_op_broadcast_loop, - &elementwise_arithm_op_loop); + elementwise_op( + in1, in2, out, window, &elementwise_arithm_op_scalar, + 
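// Illustrative aside (not part of this patch): for integer DIV the scalar path above turns
// C++'s truncating division into floor division and maps division by zero to 0. Standalone
// equivalent:
inline int floor_div(int a, int b)
{
    if (b == 0)
    {
        return 0;
    }
    int res = a / b; // truncates toward zero
    if ((a % b != 0) && ((a < 0) != (b < 0)))
    {
        --res; // opposite signs with a remainder: step down to the floor
    }
    return res;
}
// e.g. floor_div(-7, 2) == -4, whereas -7 / 2 == -3 in plain C++.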
&elementwise_arithm_op_broadcast_loop, + &elementwise_arithm_op_loop); } template @@ -279,7 +313,7 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS { bool res = false; - switch(op) + switch (op) { case ComparisonOperation::Equal: res = (a == b); @@ -308,9 +342,9 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS template inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) { - OutputVectorType res = { 0, 0, 0, 0 }; + OutputVectorType res = {0, 0, 0, 0}; - switch(op) + switch (op) { case ComparisonOperation::Equal: res = wrapper::vceq(a, b); @@ -338,53 +372,75 @@ inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const Inpu } template -inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) +inline OutputVectorType +elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) { InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_comp_op(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); + return elementwise_comp_op(reorder ? broadcast_vector : a, + reorder ? a : broadcast_vector); } template -inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); wrapper::vstore(output_ptr + x, a); } return x; } template -inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); } return x; } template -inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int 
elementwise_comp_op_broadcast_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); - const auto b = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); + const auto b = elementwise_comp_op_broadcast( + wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); } - if(x <= window_end_x - 4) + if (x <= window_end_x - 4) { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - for(int i = 0; i < 4; i++) + const auto a = elementwise_comp_op_broadcast( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + for (int i = 0; i < 4; i++) { *(output_ptr + x + i) = wrapper::vgetlane(a, i); } @@ -394,11 +450,15 @@ inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_ } template -inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -409,11 +469,15 @@ inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int } template -inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -424,11 +488,15 @@ inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int } template -inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - 
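// Illustrative aside (not part of this patch): the 32-bit comparison loops above produce a
// uint32x4_t mask (all-ones / all-zero lanes) per four elements and narrow two of them
// u32 -> u16 -> u8 so that eight uint8_t flags are stored at once. Minimal version with
// assumed names, comparing eight floats for "greater than":
#include <arm_neon.h>
#include <cstdint>

inline void greater_f32_to_u8(const float *a, const float *b, uint8_t *dst) // writes 8 flags
{
    const uint32x4_t m0  = vcgtq_f32(vld1q_f32(a), vld1q_f32(b));
    const uint32x4_t m1  = vcgtq_f32(vld1q_f32(a + 4), vld1q_f32(b + 4));
    const uint16x8_t m16 = vcombine_u16(vmovn_u32(m0), vmovn_u32(m1));
    vst1_u8(dst, vmovn_u16(m16)); // 0x00 or 0xFF per element
}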
window_step_x); x += window_step_x) { auto a = wrapper::vloadq(input1_ptr + x); auto b = wrapper::vloadq(input2_ptr + x); @@ -438,12 +506,12 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int const auto res2 = elementwise_comp_op(a, b); wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); } - if(x <= window_end_x - 4) + if (x <= window_end_x - 4) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); const auto res = elementwise_comp_op(a, b); - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { *(output_ptr + x + i) = wrapper::vgetlane(res, i); } @@ -455,57 +523,59 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int template void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_8_loop, - &elementwise_comp_op_8_loop); + elementwise_op( + in1, in2, out, window, &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_8_loop, + &elementwise_comp_op_8_loop); } template void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_16_loop, - &elementwise_comp_op_16_loop); + elementwise_op( + in1, in2, out, window, &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_16_loop, + &elementwise_comp_op_16_loop); } template void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_32_loop, - &elementwise_comp_op_32_loop); + elementwise_op( + in1, in2, out, window, &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_32_loop, + &elementwise_comp_op_32_loop); } inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { - qasymm8x16_t x = vld1q_u8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - } - }; + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; return out; } inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { - qasymm8x16_signed_t x = vld1q_s8(input1_ptr); - const 
float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - } - }; + qasymm8x16_signed_t x = vld1q_s8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + }}; return out; } @@ -523,17 +593,15 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) vst1q_u8(output_ptr, vcombine_u8(pa, pb)); } -inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void +store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) { - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; store_quantized(output_ptr, out); } @@ -544,17 +612,17 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) vst1q_s8(output_ptr, vcombine_s8(pa, pb)); } -inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void store_quantized_signed(int8_t *output_ptr, + const float32x4x4_t &rf, + const float32x4_t &offset, + const float32x4_t &invscale) { - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; store_quantized_signed(output_ptr, out); } @@ -565,7 +633,8 @@ inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const floa } template -inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +inline int8_t +elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) { return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); } @@ -574,15 +643,12 @@ template float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) { using neon_vector_float = wrapper::traits::neon_vector; - 
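// Illustrative aside (not part of this patch): load_quantized / store_quantized above widen
// QASYMM8 data to float (subtract the zero-point, multiply by the scale) and do the inverse
// on the way out. Reduced here to one 8-lane slice with assumed names; the library handles
// 16 values at a time as a float32x4x4_t.
#include <arm_neon.h>
#include <cstdint>

inline float32x4_t dequantize_lo4_u8(uint8x8_t q, int32_t offset, float scale)
{
    const uint16x8_t w16 = vmovl_u8(q);                                         // u8  -> u16
    const int32x4_t  w32 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(w16))); // u16 -> s32
    return vmulq_f32(vcvtq_f32_s32(vsubq_s32(w32, vdupq_n_s32(offset))), vdupq_n_f32(scale));
}

inline uint8x8_t requantize_u8(float32x4_t lo, float32x4_t hi, float offset, float inv_scale)
{
    const int32x4_t i_lo = vcvtq_s32_f32(vmlaq_f32(vdupq_n_f32(offset), lo, vdupq_n_f32(inv_scale)));
    const int32x4_t i_hi = vcvtq_s32_f32(vmlaq_f32(vdupq_n_f32(offset), hi, vdupq_n_f32(inv_scale)));
    return vqmovun_s16(vcombine_s16(vqmovn_s32(i_lo), vqmovn_s32(i_hi))); // saturate back to u8
}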
float32x4x4_t out = - { - { - elementwise_arithm_op(a.val[0], b.val[0]), - elementwise_arithm_op(a.val[1], b.val[1]), - elementwise_arithm_op(a.val[2], b.val[2]), - elementwise_arithm_op(a.val[3], b.val[3]), - } - }; + float32x4x4_t out = {{ + elementwise_arithm_op(a.val[0], b.val[0]), + elementwise_arithm_op(a.val[1], b.val[1]), + elementwise_arithm_op(a.val[2], b.val[2]), + elementwise_arithm_op(a.val[3], b.val[3]), + }}; return out; } @@ -596,26 +662,29 @@ inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float template inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) { - uint32x4x4_t out = - { - { - elementwise_comp_op(a.val[0], b.val[0]), - elementwise_comp_op(a.val[1], b.val[1]), - elementwise_comp_op(a.val[2], b.val[2]), - elementwise_comp_op(a.val[3], b.val[3]) - } - }; + uint32x4x4_t out = {{elementwise_comp_op(a.val[0], b.val[0]), + elementwise_comp_op(a.val[1], b.val[1]), + elementwise_comp_op(a.val[2], b.val[2]), + elementwise_comp_op(a.val[3], b.val[3])}}; return out; } template -inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_arithm_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { // Get inputs and compute output const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); @@ -627,13 +696,21 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_e } template -inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + int8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { // Get inputs and compute output const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); @@ -645,45 +722,71 @@ inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int w } template -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_arithm_op_quantized_broadcast_loop(int 
window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf, voffseto, invvscaleo); } return x; } template -inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + int8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? 
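// Illustrative aside (not part of this patch): the 'reorder' flag threaded through the
// broadcast loops above keeps operand order correct for non-commutative operations when
// input1 (rather than input2) happens to be the broadcast side. Scalar analogue with
// assumed names:
template <typename T>
inline T apply_with_reorder(T per_element, T broadcast_value, bool reorder, T (*op)(T, T))
{
    return reorder ? op(broadcast_value, per_element) : op(per_element, broadcast_value);
}
// With op = subtraction: reorder == true computes broadcast - x, reorder == false computes x - broadcast.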
af : broadcast_vector); store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); } return x; } template -inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_comp_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); @@ -694,14 +797,22 @@ inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end } template -inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); @@ -712,46 +823,85 @@ inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int win } template -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf); } return x; } template -inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf); } return x; } -inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, +inline void elementwise_op_quantized(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) + int (*broadcast_func)(int, + int, + int, + const uint8_t *, + float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const uint8_t *, + const uint8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -772,7 +922,7 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -794,24 +944,28 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = 
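// Illustrative aside (not part of this patch): as in elementwise_op_quantized above, the
// per-tensor quantization constants are splatted into vectors once, outside the element
// loops, so the inner loops only subtract and multiply. Assumed helper:
#include <arm_neon.h>
#include <cstdint>

struct QParamsVec
{
    int32x4_t   voffset; // zero-point in every lane
    float32x4_t vscale;  // scale in every lane
};

inline QParamsVec make_qparams(int32_t offset, float scale)
{
    return {vdupq_n_s32(offset), vdupq_n_f32(scale)};
}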
reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); + const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? 
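// Illustrative aside (not part of this patch): in the broadcast path above the single
// quantized scalar is dequantized once per row (vdequantize(vdupq_n_u8(value), qinfo)) and
// the resulting float vector is reused across the whole X loop. One 4-lane slice of that
// idea with assumed names:
#include <arm_neon.h>
#include <cstdint>

inline float32x4_t broadcast_dequant_u8(uint8_t value, int32_t offset, float scale)
{
    const int32x4_t v = vdupq_n_s32(static_cast<int32_t>(value) - offset);
    return vmulq_f32(vcvtq_f32_s32(v), vdupq_n_f32(scale));
}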
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -834,32 +988,56 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } -inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void +elementwise_comp_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, + float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -879,7 +1057,7 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -901,24 +1079,28 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = 
reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -941,32 +1123,56 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } -inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - int8_t (*scalar_func)(const float &, const 
float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void +elementwise_op_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, + float32x4x4_t, + int8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + int8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -986,7 +1192,7 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -1008,24 +1214,28 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -1048,22 +1258,24 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp index c5c528d3f3..09ad13d5eb 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute { @@ -33,63 +34,165 @@ void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_arithm_op>(in1, in2, out, window); } -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_8(in1, in2, out, window); } -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_16(in1, in2, out, window); } -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary(const 
ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_32(in1, in2, out, window); } -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp index fa8e08745a..d891f70644 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute { @@ -33,27 +34,72 @@ void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe return elementwise_arithm_op_quantized(in1, in2, out, window); } -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comp_op_quantized(in1, in2, out, window); } -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + 
const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp index abfdf93b75..b1f8e018f5 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -34,27 +35,70 @@ void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_arithm_op_quantized_signed(in1, in2, out, window); } -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor 
*in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comp_op_quantized_signed(in1, in2, out, window); } -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp index 85224351df..600c7f1c05 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp @@ -25,6 +25,7 @@ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -36,14 +37,38 @@ void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const 
ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -51,14 +76,32 @@ void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp 
b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp index 2b479f76f1..832a966883 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -34,26 +35,68 @@ void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, 
+ const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp index c0515f2abc..fa48407e9b 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp @@ -23,7 +23,9 @@ */ #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" + #include "src/core/NEON/SVEMath.h" + #include namespace arm_compute @@ -33,7 +35,8 @@ namespace cpu using namespace arm_compute::wrapper; template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { using VectorType = typename sve_vector::type; @@ -51,7 +54,7 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -66,37 +69,40 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_vector = svdup_n(broadcast_value); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); - VectorType res{}; + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); - if(is_broadcast_input_2) - { - res = elementwise_arithmetic_op::type>(pg, non_broadcast_vector, broadcast_vector, op); - } - else + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do { - res = elementwise_arithmetic_op::type>(pg, broadcast_vector, non_broadcast_vector, op); - } - svst1(pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + VectorType res{}; + + if (is_broadcast_input_2) + { + res = elementwise_arithmetic_op::type>(pg, non_broadcast_vector, + broadcast_vector, op); + } + else + { + res = elementwise_arithmetic_op::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -108,39 +114,46 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt(x, window_end_x); - do - { - const auto in1 = svld1(pg, input1_ptr + x); - const auto in2 = svld1(pg, input2_ptr + x); - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, op); - svst1(pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = svwhilelt(x, window_end_x); + do + { + const auto in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, op); + svst1(pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void elementwise_arithmetic_op(const 
ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) { - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); using OutputVectorType = typename sve_vector::type; const auto all_true_pg = svptrue(); @@ -157,7 +170,7 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -172,37 +185,44 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_vector = svdup_n(broadcast_value); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - OutputVectorType res{}; - if(is_broadcast_input_2) - { - res = elementwise_comparison_op::type, typename sve_vector::type>(pg, non_broadcast_vector, broadcast_vector, op); - } - else + svbool_t pg = svwhilelt(x, window_end_x); + do { - res = elementwise_comparison_op::type, typename sve_vector::type>(pg, broadcast_vector, non_broadcast_vector, op); - } - svst1(output_pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + OutputVectorType res{}; + if (is_broadcast_input_2) + { + res = elementwise_comparison_op::type, + typename sve_vector::type>( + pg, non_broadcast_vector, broadcast_vector, op); + } + else + { + res = elementwise_comparison_op::type, + typename sve_vector::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(output_pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -214,37 +234,45 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt(x, window_end_x); - do - { - const auto in1 = svld1(pg, input1_ptr + x); - const auto in2 = svld1(pg, input2_ptr + x); - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, output_ptr + x, res); - - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = svwhilelt(x, window_end_x); + do + { + const auto 
in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = + elementwise_comparison_op::type, + typename sve_vector::type>(pg, in1, in2, op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, output_ptr + x, res); + + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); template <> svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h index 860c50a1e0..4c61b9f315 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/svtraits.h" @@ -51,7 +52,7 @@ svbool_t narrow_to_byte_predicate(svbool_t pg) { const auto all_false = svpfalse(); - switch(bytewidth) + switch (bytewidth) { case 8: pg = svuzp1_b32(pg, all_false); @@ -74,7 +75,7 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve using ScalarType = typename wrapper::sve_scalar::type; VectorType res{}; - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = svmax_z(pg, a, b); @@ -114,11 +115,12 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve } template -OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +OutputVectorType +elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) { svbool_t selection_vector{}; - switch(op) + switch (op) { case ComparisonOperation::Equal: selection_vector = svcmpeq(pg, a, b); @@ -154,10 +156,12 @@ OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType & } template 
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window); +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window); template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window); +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window); } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp index c313fc6e04..f7714ff7e9 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -33,64 +34,166 @@ void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor { return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_arithmetic_op(in1, in2, out, op, window); } -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void 
sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + 
ITensor *out, + const Window &window); template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op(in1, in2, out, op, window); } -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, 
+ const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h index 41e0ac77db..7c6015d379 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -35,19 +35,14 @@ inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint3 { auto x = svld1(pg, ptr); - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); pg = svptrue_b8(); - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); } inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) @@ -56,28 +51,24 @@ inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint //vprint(x); - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); pg = svptrue_b8(); - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); } -inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void +store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, 
svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); @@ -85,13 +76,14 @@ inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svst1(pg, ptr, narrowed); } -inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void +store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); @@ -101,7 +93,8 @@ inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const } template -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +void elementwise_arithmetic_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { const auto all_true_pg = wrapper::svptrue(); @@ -120,7 +113,7 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -128,8 +121,10 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? 
in2->info()->quantization_info() : in1->info()->quantization_info(); const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); @@ -141,48 +136,52 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float broadcast_value_f = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - - svfloat32x4_t result{}; - - if(!is_broadcast_input_2) + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do { - result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in2, 0), svget4(in1, 0), op), - elementwise_arithmetic_op(pg, svget4(in2, 1), svget4(in1, 1), op), - elementwise_arithmetic_op(pg, svget4(in2, 2), svget4(in1, 2), op), - elementwise_arithmetic_op(pg, svget4(in2, 3), svget4(in1, 3), op)); - } - else - { - result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); - } - - store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svfloat32x4_t result{}; + + if (!is_broadcast_input_2) + { + result = + svcreate4(elementwise_arithmetic_op(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_arithmetic_op(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_arithmetic_op(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_arithmetic_op(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = + svcreate4(elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + store_quantized(output_ptr + x, pg, result, output_voffset, 
output_vscale); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -200,41 +199,44 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); - const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); - - store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + + const auto result = + svcreate4(elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } template -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +void elementwise_comparison_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) { - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); using OutputVectorType = typename wrapper::traits::sve_vector::type; const auto all_true_pg = wrapper::svptrue(); @@ -251,7 +253,7 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool 
is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -259,8 +261,10 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); @@ -272,51 +276,63 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float broadcast_value_f = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - - svuint8x4_t result{}; - - if(!is_broadcast_input_2) + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do { - result = svcreate4( - elementwise_comparison_op(pg, svget4(in2, 0), svget4(in1, 0), op), - elementwise_comparison_op(pg, svget4(in2, 1), svget4(in1, 1), op), - elementwise_comparison_op(pg, svget4(in2, 2), svget4(in1, 2), op), - elementwise_comparison_op(pg, svget4(in2, 3), svget4(in1, 3), op)); - } - else - { - result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), op)); - } - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, output_ptr + x, zipped); - - x += wrapper::svcnt(); - pg = 
wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svuint8x4_t result{}; + + if (!is_broadcast_input_2) + { + result = svcreate4(elementwise_comparison_op(pg, svget4(in2, 0), + svget4(in1, 0), op), + elementwise_comparison_op(pg, svget4(in2, 1), + svget4(in1, 1), op), + elementwise_comparison_op(pg, svget4(in2, 2), + svget4(in1, 2), op), + elementwise_comparison_op( + pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4(elementwise_comparison_op(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op( + pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -334,39 +350,44 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); - const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, output_ptr + x, zipped); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + const auto result = + svcreate4(elementwise_comparison_op(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op(pg, 
svget4(in1, 3), + svget4(in2, 3), op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ \ No newline at end of file +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp index 7435bb4f29..5cc66642d7 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" namespace arm_compute { @@ -34,27 +35,72 @@ void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe return elementwise_arithmetic_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window 
&window) { return elementwise_comparison_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp index 1027a1eed0..165e0c05fa 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" namespace arm_compute { @@ -34,27 +35,70 @@ void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_arithmetic_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const 
ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template -void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comparison_quantized_op(in1, in2, out, op, window); } -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp index b2833c2481..2588db024d 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp @@ -23,17 +23,19 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && 
defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op<__fp16>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp index 6566821eca..936a2e588a 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp @@ -22,16 +22,18 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h index dbc1dde4fa..d54d3984cb 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" @@ -36,7 +37,7 @@ namespace cpu template inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: return 1 / sqrt(a); @@ -60,7 +61,7 @@ inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarTyp template inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: return wrapper::vinvsqrt(a); @@ -94,22 +95,24 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); - } - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); - } - }, - input, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); + } + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + 
x)); + } + }, + input, output); } template <> @@ -128,75 +131,81 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Window Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - int8x16_t vout; - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto vconst_0_f32 = vdupq_n_f32(0); - auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(input_ptr + x); - - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); + int8x16_t vout; + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto vconst_0_f32 = vdupq_n_f32(0); + auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; - // Perform activation - float32x4x4_t vtmp_deq = + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) { - { + const auto vin = wrapper::vloadq(input_ptr + x); + + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + + // Perform activation + float32x4x4_t vtmp_deq = {{ elementwise_op_imp(op, vin_deq.val[0]), elementwise_op_imp(op, vin_deq.val[1]), elementwise_op_imp(op, vin_deq.val[2]), elementwise_op_imp(op, vin_deq.val[3]), + }}; + + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); } - }; - if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) - { - vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); - vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); - vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); - vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); + // Re-quantize to new output space + vout = vquantize_signed(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); } - - // Re-quantize to new output space - vout = vquantize_signed(vtmp_deq, qi_out); - wrapper::vstore(output_ptr + x, vout); - } - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_signed_t tmp = 0; - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - if(tmp_f <= 0.0) + for (; x < window_end_x; ++x) { - if(op == ElementWiseUnary::LOG) - { - tmp_f = (-128 - qi_out.offset) * qi_out.scale; - } - else if(op == ElementWiseUnary::RSQRT) + qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_signed_t tmp = 0; + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + if (tmp_f <= 0.0) { - tmp_f = (127 - qi_out.offset) * qi_out.scale; + if (op == ElementWiseUnary::LOG) + { + tmp_f = (-128 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (127 - qi_out.offset) * qi_out.scale; + } + 
else + { + tmp_f = elementwise_op_scalar_imp(op, tmp_f); + } } else { tmp_f = elementwise_op_scalar_imp(op, tmp_f); } + tmp = quantize_qasymm8_signed( + tmp_f, qi_out, + RoundingPolicy:: + TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a. + // For aarch64 LUT is used and rounding to nearest is used + *(output_ptr + x) = tmp; } - else - { - tmp_f = elementwise_op_scalar_imp(op, tmp_f); - } - tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a. - // For aarch64 LUT is used and rounding to nearest is used - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } template <> inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) @@ -215,71 +224,74 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Windo Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - uint8x16_t vout; - auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(input_ptr + x); + uint8x16_t vout; + auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - float32x4x4_t vtmp_deq = - { - { + // Perform activation + float32x4x4_t vtmp_deq = {{ elementwise_op_imp(op, vin_deq.val[0]), elementwise_op_imp(op, vin_deq.val[1]), elementwise_op_imp(op, vin_deq.val[2]), elementwise_op_imp(op, vin_deq.val[3]), + }}; + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); } - }; - if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) - { - vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); - vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); - vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); - vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); - } - // Re-quantize to new output space - vout = vquantize(vtmp_deq, qi_out); - wrapper::vstore(output_ptr + x, vout); - } - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_t tmp = 0; - float 
tmp_f = dequantize_qasymm8(in, qi_in); - if(tmp_f <= 0.0) + // Re-quantize to new output space + vout = vquantize(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); + } + for (; x < window_end_x; ++x) { - if(op == ElementWiseUnary::LOG) + qasymm8_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_t tmp = 0; + float tmp_f = dequantize_qasymm8(in, qi_in); + if (tmp_f <= 0.0) { - tmp_f = (0 - qi_out.offset) * qi_out.scale; - } - else if(op == ElementWiseUnary::RSQRT) - { - tmp_f = (255 - qi_out.offset) * qi_out.scale; + if (op == ElementWiseUnary::LOG) + { + tmp_f = (0 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (255 - qi_out.offset) * qi_out.scale; + } + else + { + tmp_f = elementwise_op_scalar_imp(op, tmp_f); + } } else { tmp_f = elementwise_op_scalar_imp(op, tmp_f); } + tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO); + *(output_ptr + x) = tmp; } - else - { - tmp_f = elementwise_op_scalar_imp(op, tmp_f); - } - tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO); - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp index dfe5e30035..d4daad4ca6 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp @@ -22,16 +22,18 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp index 08bb7f28b6..38cb61d0ff 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -32,24 +33,28 @@ namespace cpu #ifdef __aarch64__ -void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(op); - auto win = window; + auto win = window; const auto window_end_x = window.x().end(); win.set(0, Window::Dimension(0, 1, 1)); Iterator src_it(in, win); Iterator dst_it(out, win); - execute_window_loop(win, [&](const Coordinates &) { - const auto src_ptr = src_it.ptr(); - auto dst_ptr = dst_it.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); - lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr); - }, - src_it, dst_it); + lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); } #endif // __aarch64__ diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp index d987f7747b..3e4b88eb47 100644 --- 
a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Window.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute @@ -31,7 +32,8 @@ namespace cpu { #ifndef __aarch64__ // Fallback function to be used for armv7a, for aarch64 LUT is used -void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_qasymm8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp index e00970a1e0..a5f4b053e3 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Window.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute @@ -31,7 +32,8 @@ namespace cpu { #ifndef __aarch64__ // Fallback function to be used for armv7a, for aarch64 LUT is used -void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_qasymm8_signed_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op(in, out, window, op); diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp index a883309b2e..22ff43c5d9 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" @@ -30,11 +31,12 @@ namespace arm_compute { namespace cpu { -void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp index b21ed8ddbc..394bd47adf 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" @@ -30,10 +31,11 @@ namespace arm_compute { namespace cpu { -void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return 
elementwise_sve_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp index a948862906..5af534d9e7 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -31,9 +32,10 @@ namespace arm_compute namespace cpu { template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +inline typename std::enable_if::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: return svinvsqrt(pg, a); @@ -55,9 +57,10 @@ inline typename std::enable_if::val } template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +inline typename std::enable_if::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::NEG: return svneg_z(pg, a); @@ -81,23 +84,24 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input, output); } template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp index 068c3f7cda..e27fe5a87f 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp @@ -23,16 +23,18 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git 
a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp index 7e32f50132..4e4582debb 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp @@ -23,13 +23,15 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute { namespace cpu { -void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve2_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(op); @@ -40,14 +42,16 @@ void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &wi Iterator src_it(in, win); Iterator dst_it(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = src_it.ptr(); - auto dst_ptr = dst_it.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); - lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr); - }, - src_it, dst_it); + lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); } } // namespace cpu diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h index 4367e0ffc9..5ac78df324 100644 --- a/src/cpu/kernels/floor/list.h +++ b/src/cpu/kernels/floor/list.h @@ -28,8 +28,7 @@ namespace arm_compute { namespace cpu { -#define DECLARE_FLOOR_KERNEL(func_name) \ - void func_name(const void *src, void *dst, int len) +#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len) DECLARE_FLOOR_KERNEL(fp16_neon_floor); DECLARE_FLOOR_KERNEL(fp32_neon_floor); diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp index f362676a36..f47690277d 100644 --- a/src/cpu/kernels/floor/neon/fp16.cpp +++ b/src/cpu/kernels/floor/neon/fp16.cpp @@ -45,14 +45,14 @@ void fp16_neon_floor(const void *src, void *dst, int len) auto psrc = static_cast(src); auto pdst = static_cast<__fp16 *>(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); psrc += step; pdst += step; } - for(; len > 0; --len) + for (; len > 0; --len) { *pdst = std::floor(*psrc); ++psrc; diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp index f5efb2e849..a86e24d3c3 100644 --- a/src/cpu/kernels/floor/neon/fp32.cpp +++ b/src/cpu/kernels/floor/neon/fp32.cpp @@ -43,14 +43,14 @@ void fp32_neon_floor(const void *src, void *dst, int len) auto psrc = static_cast(src); auto pdst = static_cast(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); psrc += step; pdst += step; } - for(; len > 0; --len) + for (; len > 0; --len) { *pdst = std::floor(*psrc); ++pdst; diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp index a29ee762fc..2821af32ce 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp @@ -29,11 +29,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_conv_f16(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, 
float epsilon, const Window &window) +void fused_batch_normalization_conv_f16(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp index 076e97651d..3ca5b6977a 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp @@ -28,11 +28,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_conv_f32(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_conv_f32(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_conv(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h index b9017600d6..6fa843263a 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -32,8 +33,16 @@ namespace arm_compute namespace cpu { template -void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_conv(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { using ScalarType = T; const int size = 16 / conv_weights->info()->element_size(); @@ -53,13 +62,20 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor * Iterator conv_w_in(conv_weights, win); Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win); - const auto conv_bias_in = (conv_bias != nullptr ? reinterpret_cast(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto conv_bias_out = (run_in_place_bias ? 
conv_bias_in : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); + const auto conv_bias_in = + (conv_bias != nullptr ? reinterpret_cast(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto conv_bias_out = + (run_in_place_bias ? conv_bias_in + : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); @@ -73,59 +89,61 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor * auto gamma = ScalarType(1.0); auto beta = ScalarType(0.0); auto conv_bias_in_scalar = ScalarType(0.0); - execute_window_loop(win, [&](const Coordinates & id) - { - var = input_var[id[3]]; - if(input_gamma != nullptr) + execute_window_loop( + win, + [&](const Coordinates &id) { - gamma = input_gamma[id[3]]; - } + var = input_var[id[3]]; + if (input_gamma != nullptr) + { + gamma = input_gamma[id[3]]; + } - if((id[0] == 0) && (id[1] == 0) && (id[2] == 0)) - { - if(input_beta != nullptr) + if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0)) { - beta = input_beta[id[3]]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + if (input_beta != nullptr) + { + beta = input_beta[id[3]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Construct vectors + mean = input_mean[id[3]]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + + if (conv_bias_in != nullptr) + { + conv_bias_in_scalar = conv_bias_in[id[3]]; + } + auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta; } - // Construct vectors - mean = input_mean[id[3]]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + int x = window_start_x; + auto conv_w_in_ptr = reinterpret_cast(conv_w_in.ptr()); + auto conv_w_out_ptr = reinterpret_cast(conv_w_out.ptr()); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - if(conv_bias_in != nullptr) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - conv_bias_in_scalar = conv_bias_in[id[3]]; - } - auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta; - } - - int x = window_start_x; - auto conv_w_in_ptr = reinterpret_cast(conv_w_in.ptr()); - auto conv_w_out_ptr = reinterpret_cast(conv_w_out.ptr()); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto wn = wrapper::vloadq(conv_w_in_ptr + x); - wn = wrapper::vmul(wn, rvar_vec); - wn = 
wrapper::vmul(wn, gamma_vec); + auto wn = wrapper::vloadq(conv_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); - // Store results - wrapper::vstore(conv_w_out_ptr + x, wn); - } + // Store results + wrapper::vstore(conv_w_out_ptr + x, wn); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - conv_w_in, conv_w_out); -} -} + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + conv_w_in, conv_w_out); } +} // namespace cpu +} // namespace arm_compute #endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h index e25b1e5fed..a03dd74f78 100644 --- a/src/cpu/kernels/fuse_batch_normalization/list.h +++ b/src/cpu/kernels/fuse_batch_normalization/list.h @@ -30,15 +30,18 @@ namespace cpu { #define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name) \ void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) #define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name) \ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) #define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name) \ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16); DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32); @@ -50,7 +53,7 @@ DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_ #undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL #undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL #undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL -} -} +} // namespace cpu +} // namespace arm_compute -#endif // \ No newline at end of file +#endif // diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp index 1e3be8792d..c0b0dfd4dc 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp @@ -29,8 +29,16 @@ namespace arm_compute namespace cpu { template -void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const 
ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { using ScalarType = T; const int size = 16 / dwc_weights->info()->element_size(); @@ -50,13 +58,20 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso Iterator dwc_w_in(dwc_weights, win); Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); - const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? dwc_bias_in + : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); @@ -70,74 +85,92 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso auto gamma = ScalarType(1.0); auto beta = ScalarType(0.0); auto dwc_bias_in_scalar = ScalarType(0.0); - execute_window_loop(win, [&](const Coordinates & id) - { - var = input_var[id[2]]; - if(input_gamma != nullptr) + execute_window_loop( + win, + [&](const Coordinates &id) { - gamma = input_gamma[id[2]]; - } - - if(id[1] == 0) - { - mean = input_mean[id[2]]; - - // Construct vectors - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - if(input_beta != nullptr) + var = input_var[id[2]]; + if (input_gamma != nullptr) { - beta = input_beta[id[2]]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + gamma = input_gamma[id[2]]; } - if(dwc_bias_in != nullptr) + if (id[1] == 0) { - dwc_bias_in_scalar = dwc_bias_in[id[2]]; + mean = input_mean[id[2]]; + + // Construct vectors + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + if (input_beta != nullptr) + { + beta = input_beta[id[2]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = dwc_bias_in[id[2]]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta; } - auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta; - } + int x = window_start_x; + auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); + var_vec = 
wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - int x = window_start_x; - auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto wn = wrapper::vloadq(dwc_w_in_ptr + x); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); - // Store results - wrapper::vstore(dwc_w_out_ptr + x, wn); - } + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - dwc_w_in, dwc_w_out); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + dwc_w_in, dwc_w_out); } -void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nchw(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp index 275211ff38..1d88d3b494 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp +++ 
b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp @@ -30,11 +30,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp index 67169c5325..1f336bb196 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp @@ -29,11 +29,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nhwc(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h index 6f0386276f..5b74a7aef6 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -32,8 +33,16 @@ namespace arm_compute namespace cpu { template -void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { using ScalarType = T; const int size = 16 / dwc_weights->info()->element_size(); @@ -53,13 +62,20 @@ void fused_batch_normalization_dwc_nhwc(const ITensor 
*dwc_weights, const ITenso Iterator dwc_w_in(dwc_weights, win); Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); - const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? dwc_bias_in + : reinterpret_cast(fused_bias->ptr_to_element(Coordinates(0, 0)))); const auto input_mean = reinterpret_cast(bn_mean->ptr_to_element(Coordinates(0, 0))); const auto input_var = reinterpret_cast(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); @@ -73,81 +89,84 @@ void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITenso auto beta = ScalarType(0.0); auto dwc_bias_in_scalar = ScalarType(0); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - var_vec = wrapper::vloadq(input_var + x); - if(input_gamma != nullptr) - { - gamma_vec = wrapper::vloadq(input_gamma + x); - } - - if((id[2] == 0) && (id[1] == 0)) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - mean_vec = wrapper::vloadq(input_mean + x); - - // Construct vectors - if(input_beta != nullptr) + var_vec = wrapper::vloadq(input_var + x); + if (input_gamma != nullptr) { - beta_vec = wrapper::vloadq(input_beta + x); + gamma_vec = wrapper::vloadq(input_gamma + x); } - if(dwc_bias_in != nullptr) + if ((id[2] == 0) && (id[1] == 0)) { - dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x); + mean_vec = wrapper::vloadq(input_mean + x); + + // Construct vectors + if (input_beta != nullptr) + { + beta_vec = wrapper::vloadq(input_beta + x); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x); + } + + auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), + wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec))); + dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec); + wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec); } - auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec))); - dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec); - wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec); - } - - auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); + auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); - auto wn = wrapper::vloadq(dwc_w_in_ptr 
+ x); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); - // Store results - wrapper::vstore(dwc_w_out_ptr + x, wn); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto var = input_var[x]; - if(input_gamma != nullptr) - { - gamma = input_gamma[x]; + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); } - if(id[2] == 0 && id[1] == 0) + // Compute left-over elements + for (; x < window_end_x; ++x) { - auto mean = input_mean[x]; - if(input_beta != nullptr) + auto var = input_var[x]; + if (input_gamma != nullptr) { - beta = input_beta[x]; + gamma = input_gamma[x]; } - if(dwc_bias_in != nullptr) + + if (id[2] == 0 && id[1] == 0) { - dwc_bias_in_scalar = dwc_bias_in[x]; + auto mean = input_mean[x]; + if (input_beta != nullptr) + { + beta = input_beta[x]; + } + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = dwc_bias_in[x]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta; } - auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta; - } - - const auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); + const auto dwc_w_in_ptr = reinterpret_cast(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast(dwc_w_out.ptr()); - *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - dwc_w_in, dwc_w_out); + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + dwc_w_in, dwc_w_out); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp index 505a37174e..4d7507a5da 100644 --- a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp @@ -48,30 +48,32 @@ void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); - const float16x8x2_t c = vld2q_f16(in_ptr + x); - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); - alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - vst2q_f16(out_ptr + x, alpha_ab); - } + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); + const float16x8x2_t c = vld2q_f16(in_ptr + x); + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); + alpha_ab.val[1] = 
vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * static_cast(beta); - } - }, - in, out); + vst2q_f16(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * static_cast(beta); + } + }, + in, out); } } // namespace void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta) diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp index dd0384ca13..47de0f3928 100644 --- a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h" + #include namespace arm_compute @@ -44,33 +45,35 @@ void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); - const float32x4x4_t c = vld4q_f32(in_ptr + x); + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); - alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); - alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); - alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); + const float32x4x4_t c = vld4q_f32(in_ptr + x); - vst4q_f32(out_ptr + x, alpha_ab); - } + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); + alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); + alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); + alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * beta; - } - }, - in, out); + vst4q_f32(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * beta; + } + }, + in, out); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp index 8fd79f9287..60fda511e3 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp @@ -32,7 +32,8 @@ namespace arm_compute { namespace cpu { -void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void vector_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { const auto width_matrix_b = static_cast(dst->info()->dimension(0)); const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); @@ 
-42,7 +43,8 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor const int window_start_x = 32 * info.thread_id; const int window_step_x = 32 * info.num_threads; const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); + ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, + " (window_end_x - window_start_x) must be multiple of window_step_x"); Window win_out(window); win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -55,7 +57,7 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -70,169 +72,172 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_out, + [&](const Coordinates &) { - if(x > width_matrix_b) + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. 
+ for (; x < (window_end_x - window_step_x); x += window_step_x) { - return; - } - - auto matrix_b = reinterpret_cast(inb.ptr()) + x; + if (x > width_matrix_b) + { + return; + } - float16x8_t acc0 = vdupq_n_f16(0.f); - float16x8_t acc1 = vdupq_n_f16(0.f); - float16x8_t acc2 = vdupq_n_f16(0.f); - float16x8_t acc3 = vdupq_n_f16(0.f); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - const float16x4_t a0l = vld1_f16(vec_a); - - float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); - - matrix_b += 2 * in_b_stride; - - b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); - - vec_a += 4; - matrix_b += 2 * in_b_stride; - } + float16x8_t acc0 = vdupq_n_f16(0.f); + float16x8_t acc1 = vdupq_n_f16(0.f); + float16x8_t acc2 = vdupq_n_f16(0.f); + float16x8_t acc3 = vdupq_n_f16(0.f); - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); - acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); - acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); - acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); - - matrix_b += in_b_stride; - } + auto vec_a = reinterpret_cast(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + const float16x4_t a0l = vld1_f16(vec_a); + + float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + float16x8_t b02 = 
vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); + + matrix_b += 2 * in_b_stride; + + b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); + + vec_a += 4; + matrix_b += 2 * in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f16(acc0, alpha_f16); - acc1 = vmulq_f16(acc1, alpha_f16); - acc2 = vmulq_f16(acc2, alpha_f16); - acc3 = vmulq_f16(acc3, alpha_f16); - } + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); + acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); + acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); + acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); + + matrix_b += in_b_stride; + } - auto vec_out = reinterpret_cast(out.ptr()) + x; + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = vmulq_f16(acc0, alpha_f16); + acc1 = vmulq_f16(acc1, alpha_f16); + acc2 = vmulq_f16(acc2, alpha_f16); + acc3 = vmulq_f16(acc3, alpha_f16); + } - vst1q_f16(vec_out + 0, acc0); - vst1q_f16(vec_out + 8, acc1); - vst1q_f16(vec_out + 16, acc2); - vst1q_f16(vec_out + 24, acc3); - } + auto vec_out = reinterpret_cast(out.ptr()) + x; - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) - { - return; + vst1q_f16(vec_out + 0, acc0); + vst1q_f16(vec_out + 8, acc1); + vst1q_f16(vec_out + 16, acc2); + vst1q_f16(vec_out + 24, acc3); } - auto matrix_b = reinterpret_cast(inb.ptr()) + x; + for (; x < window_end_x; ++x) + { + if (x > width_matrix_b) + { + return; + } - float16x4_t vacc = vdup_n_f16(0.f); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; 
- for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float16x4_t a0l = vld1_f16(vec_a); + float16x4_t vacc = vdup_n_f16(0.f); - const float16x4_t b_col = + auto vec_a = reinterpret_cast(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; + const float16x4_t a0l = vld1_f16(vec_a); - vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); + const float16x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; - matrix_b += 4 * in_b_stride; - } + vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); - float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); + matrix_b += 4 * in_b_stride; + } - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16_t b00 = *matrix_b; + float16_t acc = + vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); - acc += b00 * a0; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16_t b00 = *matrix_b; - matrix_b += in_b_stride; - } + acc += b00 * a0; - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= static_cast(alpha); - } + matrix_b += in_b_stride; + } - auto vec_out = reinterpret_cast(out.ptr()) + x; + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= static_cast(alpha); + } - *(vec_out) = acc; - } - }, - ina, inb, out); + auto vec_out = reinterpret_cast(out.ptr()) + x; + + *(vec_out) = acc; + } + }, + ina, inb, out); } -void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void matrix_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const int out_width = static_cast(dst->info()->dimension(0)); + const int out_height = static_cast(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); const int num_elems_matrix_b_x = rhs->info()->dimension(0); // Set step_x and step_y for matrix A. 
Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix @@ -243,7 +248,7 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -259,22 +264,16 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto *mtx_a0 = reinterpret_cast(ina.ptr()); - const auto *mtx_b0 = reinterpret_cast(inb.ptr()); - auto *mtx_out = reinterpret_cast(out.ptr()); - float16x8x4_t c = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f) - } - }; + const auto *mtx_a0 = reinterpret_cast(ina.ptr()); + const auto *mtx_b0 = reinterpret_cast(inb.ptr()); + auto *mtx_out = reinterpret_cast(out.ptr()); + float16x8x4_t c = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}}; - /* + /* This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) |a00 a01 a02 a03 | a04 a05 a06 a07| |a10 a11 a12 a13 | a14 a15 a16 a17| @@ -302,111 +301,118 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. */ - const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - - { - const float16x8_t p00 = vld1q_f16(mtx_a0); - const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); - - const float16x8_t q00 = vld1q_f16(mtx_b0); - const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); - const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); - const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); - - mtx_a0 += 16; - mtx_b0 += 32; - } + { + const float16x8_t 
p00 = vld1q_f16(mtx_a0); + const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); + + const float16x8_t q00 = vld1q_f16(mtx_b0); + const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); + const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); + const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); + + mtx_a0 += 16; + mtx_b0 += 32; + } - for(; mtx_b0 < mtx_b0_end_addr;) + for (; mtx_b0 < mtx_b0_end_addr;) - { - const float16x4_t p00 = vld1_f16(mtx_a0); - const float16x8_t q00 = vld1q_f16(mtx_b0); + { + const float16x4_t p00 = vld1_f16(mtx_a0); + const float16x8_t q00 = vld1q_f16(mtx_b0); - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); - mtx_a0 += 4; - mtx_b0 += 8; - } + mtx_a0 += 4; + mtx_b0 += 8; + } - if(multiply_alpha) - { - c.val[0] = vmulq_f16(c.val[0], alpha_f16); - c.val[1] = vmulq_f16(c.val[1], alpha_f16); - c.val[2] = vmulq_f16(c.val[2], alpha_f16); - c.val[3] = vmulq_f16(c.val[3], alpha_f16); - } + if (multiply_alpha) + { + c.val[0] = vmulq_f16(c.val[0], alpha_f16); + c.val[1] = vmulq_f16(c.val[1], alpha_f16); + c.val[2] = vmulq_f16(c.val[2], alpha_f16); + c.val[3] = vmulq_f16(c.val[3], alpha_f16); + } - if(id.x() < (out_width - 8)) - { - vst1q_f16(mtx_out, c.val[0]); - if(id.y() + 1 < out_height) + if (id.x() < (out_width - 8)) { - vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); - if(id.y() + 2 < out_height) + vst1q_f16(mtx_out, c.val[0]); + if (id.y() + 1 < out_height) { - vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); - if(id.y() + 3 < out_height) + vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); + if (id.y() + 2 < out_height) { - vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); + if (id.y() + 3 < out_height) + { + vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + } } } } - } - else - { - // Left-over columns - const int 
columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) + else { - *(mtx_out + x) = c.val[0][x]; - if(id.y() + 1 < out_height) + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) { - *(mtx_out + x + 1 * out_stride) = c.val[1][x]; - if(id.y() + 2 < out_height) + *(mtx_out + x) = c.val[0][x]; + if (id.y() + 1 < out_height) { - *(mtx_out + x + 2 * out_stride) = c.val[2][x]; - if(id.y() + 3 < out_height) + *(mtx_out + x + 1 * out_stride) = c.val[1][x]; + if (id.y() + 2 < out_height) { - *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + *(mtx_out + x + 2 * out_stride) = c.val[2][x]; + if (id.y() + 3 < out_height) + { + *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void neon_fp16_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +void neon_fp16_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) { - return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha); + return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha); } -} // namespce cpu +} // namespace cpu } // namespace arm_compute #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp index 9c1f6f3c0f..e12a312280 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace cpu { -void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +void neon_fp32_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) { - return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha); + return (is_dst_vector) ? 
vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha); } -} // namespce cpu -} // namespace arm_compute \ No newline at end of file +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp index 0051d3d9dc..404d070a37 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h" + #include "src/core/utils/helpers/float_ops.h" #include @@ -31,10 +32,12 @@ namespace arm_compute { namespace cpu { -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { - const auto width_matrix_b = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); + const auto width_matrix_b = static_cast(dst->info()->dimension(0)); + const auto in_b_stride = + static_cast(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); // The implementation computes 16 elements per iteration @@ -54,7 +57,7 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -69,209 +72,220 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_out, + [&](const Coordinates &) { - if(x > width_matrix_b) + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. 
+ for (; x < (window_end_x - window_step_x); x += window_step_x) { - return; - } + if (x > width_matrix_b) + { + return; + } - float32x4_t acc0 = vdupq_n_f32(0.f); - float32x4_t acc1 = vdupq_n_f32(0.f); - float32x4_t acc2 = vdupq_n_f32(0.f); - float32x4_t acc3 = vdupq_n_f32(0.f); + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); #endif /* __arm__ */ - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - float32x2_t a0l = vld1_f32(vec_a); + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); - float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); #endif /* __arm__ */ - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, 
b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; + vec_a += 2; + matrix_b += 2 * in_b_stride; - a0l = vld1_f32(vec_a); + a0l = vld1_f32(vec_a); - b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; - } + vec_a += 2; + matrix_b += 2 * in_b_stride; + } - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; - const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - acc0 = vmlaq_n_f32(acc0, b00, a0); - acc1 = vmlaq_n_f32(acc1, b01, a0); - acc2 = vmlaq_n_f32(acc2, b02, a0); - acc3 = vmlaq_n_f32(acc3, b03, a0); + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 = vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); - matrix_b += in_b_stride; - } + matrix_b += in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f32(acc0, alpha_f32); - acc1 = vmulq_f32(acc1, alpha_f32); - acc2 = vmulq_f32(acc2, alpha_f32); - acc3 = vmulq_f32(acc3, alpha_f32); - } + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = 
vmulq_f32(acc0, alpha_f32); + acc1 = vmulq_f32(acc1, alpha_f32); + acc2 = vmulq_f32(acc2, alpha_f32); + acc3 = vmulq_f32(acc3, alpha_f32); + } - const auto vec_out = reinterpret_cast(out.ptr()) + x; + const auto vec_out = reinterpret_cast(out.ptr()) + x; - vst1q_f32(vec_out + 0, acc0); - vst1q_f32(vec_out + 4, acc1); - vst1q_f32(vec_out + 8, acc2); - vst1q_f32(vec_out + 12, acc3); - } + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out + 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); + } - // Left-over loop - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) + // Left-over loop + for (; x < window_end_x; ++x) { - return; - } + if (x > width_matrix_b) + { + return; + } - float32x4_t vacc = vdupq_n_f32(0.f); + float32x4_t vacc = vdupq_n_f32(0.f); - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); #endif /* __arm__ */ - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float32x4_t a0l = vld1q_f32(vec_a); - - const float32x4_t b_col = + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; + const float32x4_t a0l = vld1q_f32(vec_a); + + const float32x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); #endif /* __arm__ */ - vacc = vmlaq_f32(vacc, b_col, a0l); + vacc = vmlaq_f32(vacc, b_col, a0l); - matrix_b += 4 * in_b_stride; - } + matrix_b += 4 * in_b_stride; + } - float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3); + float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + + vgetq_lane_f32(vacc, 3); - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; - const float b00 = *matrix_b; + const float b00 = 
*matrix_b; - acc += b00 * a0; + acc += b00 * a0; - matrix_b += in_b_stride; - } + matrix_b += in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= alpha; - } + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= alpha; + } - const auto vec_out = reinterpret_cast(out.ptr()) + x; + const auto vec_out = reinterpret_cast(out.ptr()) + x; - *vec_out = acc; - } - }, - ina, inb, out); + *vec_out = acc; + } + }, + ina, inb, out); } -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const size_t out_stride2 = out_stride1 * 2; - const size_t out_stride3 = out_stride1 * 3; + const int out_width = static_cast(dst->info()->dimension(0)); + const int out_height = static_cast(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const size_t out_stride2 = out_stride1 * 2; + const size_t out_stride3 = out_stride1 * 3; const int num_elems_matrix_b_x = rhs->info()->dimension(0); // Set step_x and step_y for matrix A. 
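
For readers skimming the vector_matrix_multiply_f32 hunk above: a minimal standalone sketch of its multiply-accumulate pattern, assuming a NEON-capable target. It loads two A values at a time, broadcasts each lane against four 4-wide registers of a B row with vmlaq_lane_f32, and finishes leftover rows with vmlaq_n_f32. The function name and parameters here are illustrative only and are not part of the library.

#include <arm_neon.h>
#include <cstddef>

// dst[0..15] = alpha * sum_i a[i] * b[i][0..15] for one 16-wide output tile.
// b_stride is the element stride between consecutive rows of B.
void vec_times_matrix_tile_f32(const float *a, const float *b, float *dst,
                               size_t k, size_t b_stride, float alpha)
{
    float32x4_t acc0 = vdupq_n_f32(0.f), acc1 = vdupq_n_f32(0.f);
    float32x4_t acc2 = vdupq_n_f32(0.f), acc3 = vdupq_n_f32(0.f);

    size_t i = 0;
    for (; i + 2 <= k; i += 2, b += 2 * b_stride)
    {
        const float32x2_t a01 = vld1_f32(a + i); // two consecutive A values
        // Row i of B (16 columns) scaled by lane 0 of a01
        acc0 = vmlaq_lane_f32(acc0, vld1q_f32(b + 0), a01, 0);
        acc1 = vmlaq_lane_f32(acc1, vld1q_f32(b + 4), a01, 0);
        acc2 = vmlaq_lane_f32(acc2, vld1q_f32(b + 8), a01, 0);
        acc3 = vmlaq_lane_f32(acc3, vld1q_f32(b + 12), a01, 0);
        // Row i+1 of B scaled by lane 1
        acc0 = vmlaq_lane_f32(acc0, vld1q_f32(b + b_stride + 0), a01, 1);
        acc1 = vmlaq_lane_f32(acc1, vld1q_f32(b + b_stride + 4), a01, 1);
        acc2 = vmlaq_lane_f32(acc2, vld1q_f32(b + b_stride + 8), a01, 1);
        acc3 = vmlaq_lane_f32(acc3, vld1q_f32(b + b_stride + 12), a01, 1);
    }
    for (; i < k; ++i, b += b_stride) // leftover rows, one at a time
    {
        acc0 = vmlaq_n_f32(acc0, vld1q_f32(b + 0), a[i]);
        acc1 = vmlaq_n_f32(acc1, vld1q_f32(b + 4), a[i]);
        acc2 = vmlaq_n_f32(acc2, vld1q_f32(b + 8), a[i]);
        acc3 = vmlaq_n_f32(acc3, vld1q_f32(b + 12), a[i]);
    }
    const float32x4_t valpha = vdupq_n_f32(alpha); // weight of the matrix product
    vst1q_f32(dst + 0, vmulq_f32(acc0, valpha));
    vst1q_f32(dst + 4, vmulq_f32(acc1, valpha));
    vst1q_f32(dst + 8, vmulq_f32(acc2, valpha));
    vst1q_f32(dst + 12, vmulq_f32(acc3, valpha));
}
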
Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix @@ -282,7 +296,7 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -302,338 +316,340 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto mtx_a0 = reinterpret_cast(ina.ptr()); - auto mtx_b0 = reinterpret_cast(inb.ptr()); - auto mtx_b1 = mtx_b0 + in_b_stride; + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto mtx_a0 = reinterpret_cast(ina.ptr()); + auto mtx_b0 = reinterpret_cast(inb.ptr()); + auto mtx_b1 = mtx_b0 + in_b_stride; - float32x4_t acc00 = vdupq_n_f32(0.f); - float32x4_t acc10 = vdupq_n_f32(0.f); - float32x4_t acc20 = vdupq_n_f32(0.f); - float32x4_t acc30 = vdupq_n_f32(0.f); + float32x4_t acc00 = vdupq_n_f32(0.f); + float32x4_t acc10 = vdupq_n_f32(0.f); + float32x4_t acc20 = vdupq_n_f32(0.f); + float32x4_t acc30 = vdupq_n_f32(0.f); - float32x4_t acc01 = vdupq_n_f32(0.f); - float32x4_t acc11 = vdupq_n_f32(0.f); - float32x4_t acc21 = vdupq_n_f32(0.f); - float32x4_t acc31 = vdupq_n_f32(0.f); + float32x4_t acc01 = vdupq_n_f32(0.f); + float32x4_t acc11 = vdupq_n_f32(0.f); + float32x4_t acc21 = vdupq_n_f32(0.f); + float32x4_t acc31 = vdupq_n_f32(0.f); #if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - float32x4_t b01 = vld1q_f32(mtx_b0 + 4); - float32x4_t b11 = vld1q_f32(mtx_b1 + 4); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); + float32x4_t b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" 
::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); - float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); - float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); - float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 
1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = 
vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + } + + for (; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); #if __arm__ - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - mtx_a0 += 4; - mtx_b0 += 4; - mtx_b1 += 4; - } - - // 
Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc00 = vmulq_f32(acc00, alpha_f32); - acc10 = vmulq_f32(acc10, alpha_f32); - acc20 = vmulq_f32(acc20, alpha_f32); - acc30 = vmulq_f32(acc30, alpha_f32); - acc01 = vmulq_f32(acc01, alpha_f32); - acc11 = vmulq_f32(acc11, alpha_f32); - acc21 = vmulq_f32(acc21, alpha_f32); - acc31 = vmulq_f32(acc31, alpha_f32); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - const auto mtx_out1 = mtx_out0 + 4; - - if(id.x() < (out_width - 8)) - { - vst1q_f32(mtx_out0, acc00); - vst1q_f32(mtx_out1, acc01); - if(id.y() + 1 < out_height) + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + mtx_b1 += 4; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc00 = vmulq_f32(acc00, alpha_f32); + acc10 = vmulq_f32(acc10, alpha_f32); + acc20 = vmulq_f32(acc20, alpha_f32); + acc30 = vmulq_f32(acc30, alpha_f32); + acc01 = vmulq_f32(acc01, alpha_f32); + acc11 = vmulq_f32(acc11, alpha_f32); + acc21 = vmulq_f32(acc21, alpha_f32); + acc31 = vmulq_f32(acc31, alpha_f32); + } + + const auto mtx_out0 = reinterpret_cast(out.ptr()); + const auto mtx_out1 = mtx_out0 + 4; + + if (id.x() < (out_width - 8)) { - vst1q_f32(mtx_out0 + out_stride1, acc10); - vst1q_f32(mtx_out1 + out_stride1, acc11); - if(id.y() + 2 < out_height) + vst1q_f32(mtx_out0, acc00); + vst1q_f32(mtx_out1, acc01); + if (id.y() + 1 < out_height) { - vst1q_f32(mtx_out0 + out_stride2, acc20); - vst1q_f32(mtx_out1 + out_stride2, acc21); - if(id.y() + 3 < out_height) + vst1q_f32(mtx_out0 + out_stride1, acc10); + vst1q_f32(mtx_out1 + out_stride1, acc11); + if (id.y() + 2 < out_height) { - vst1q_f32(mtx_out0 + out_stride3, acc30); - vst1q_f32(mtx_out1 + out_stride3, acc31); + vst1q_f32(mtx_out0 + out_stride2, acc20); + vst1q_f32(mtx_out1 + out_stride2, acc21); + if (id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out1 + out_stride3, acc31); + } } } } - } - else if(id.x() < (out_width - 4)) - { - vst1q_f32(mtx_out0, acc00); - if(id.y() + 1 < out_height) + else if (id.x() < (out_width - 4)) { - vst1q_f32(mtx_out0 + out_stride1, acc10); - if(id.y() + 2 < out_height) + vst1q_f32(mtx_out0, acc00); + if (id.y() + 1 < out_height) { - vst1q_f32(mtx_out0 + out_stride2, acc20); - if(id.y() + 3 < out_height) + vst1q_f32(mtx_out0 + out_stride1, acc10); + if (id.y() + 2 < out_height) { - vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out0 + out_stride2, acc20); + if (id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + } } } - } - // Left-over columns - const int columns_left = out_width - id.x() - 4; - for(auto x = 0; x < columns_left; ++x) - { - *(mtx_out1 + x) = acc01[x]; - if(id.y() + 1 < out_height) + // Left-over columns + const int columns_left = out_width - id.x() - 4; + for (auto x = 0; x < columns_left; ++x) { - *(mtx_out1 + x + out_stride1) = acc11[x]; - if(id.y() + 2 < out_height) + *(mtx_out1 + x) = acc01[x]; + if (id.y() + 1 < out_height) { - *(mtx_out1 + x + out_stride2) = acc21[x]; - if(id.y() + 3 < out_height) + *(mtx_out1 + x + out_stride1) = acc11[x]; + if (id.y() + 2 < out_height) { - *(mtx_out1 + x + out_stride3) = acc31[x]; + *(mtx_out1 + x + out_stride2) = 
acc21[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out1 + x + out_stride3) = acc31[x]; + } } } } } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) + else { - *(mtx_out0 + x) = acc00[x]; - if(id.y() + 1 < out_height) + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) { - *(mtx_out0 + x + out_stride1) = acc10[x]; - if(id.y() + 2 < out_height) + *(mtx_out0 + x) = acc00[x]; + if (id.y() + 1 < out_height) { - *(mtx_out0 + x + out_stride2) = acc20[x]; - if(id.y() + 3 < out_height) + *(mtx_out0 + x + out_stride1) = acc10[x]; + if (id.y() + 2 < out_height) { - *(mtx_out0 + x + out_stride3) = acc30[x]; + *(mtx_out0 + x + out_stride2) = acc20[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out0 + x + out_stride3) = acc30[x]; + } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } } // namespace cpu diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h index f9f1f247ac..74ea4c2b17 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h @@ -24,15 +24,18 @@ #ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H #define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/CPP/Validate.h" namespace arm_compute { namespace cpu { -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h index 9cdb58ae06..15b23b1d81 100644 --- a/src/cpu/kernels/gemm_matrix_mul/list.h +++ b/src/cpu/kernels/gemm_matrix_mul/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \ - void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \ + void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \ + float alpha, const bool is_dst_vector) DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul); DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul); #undef DECLARE_GEMMMATRIXMUL_KERNEL diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp index d4e469b691..4ed7e54f1c 100644 --- a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp @@ -27,10 +27,13 @@ namespace arm_compute { namespace cpu { -void neon_fp16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_fp16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return 
compute_all_anchors(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp index 09aa6ecec4..f15cd63bb2 100644 --- a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp @@ -26,9 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_fp32_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_fp32_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp index 9224e32a94..8cb76f3afb 100644 --- a/src/cpu/kernels/genproposals/generic/neon/impl.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp @@ -28,7 +28,10 @@ class ITensor; class Window; namespace cpu { -void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { Iterator all_anchors_it(all_anchors, window); Iterator anchors_it(all_anchors, window); @@ -39,28 +42,30 @@ void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; - const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); - const size_t shift_idy = id.y() / num_anchors; - const float shiftx = (shift_idy % feat_width) * stride; - const float shifty = (shift_idy / feat_width) * stride; + const size_t shift_idy = id.y() / num_anchors; + const float shiftx = (shift_idy % feat_width) * stride; + const float shifty = (shift_idy / feat_width) * stride; - const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; - const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; - const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; - const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; + const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; + const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; + const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; + const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; - *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); - *(out_anchor_ptr + 1) = 
quantize_qsymm16(new_anchor_y1, qinfo.scale); - *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); - *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); - }, - all_anchors_it); + *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); + *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale); + *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); + *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); + }, + all_anchors_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h index da052c9192..3317bcfbe6 100644 --- a/src/cpu/kernels/genproposals/generic/neon/impl.h +++ b/src/cpu/kernels/genproposals/generic/neon/impl.h @@ -26,13 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template -void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void compute_all_anchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { Iterator all_anchors_it(all_anchors, window); Iterator anchors_it(all_anchors, window); @@ -41,26 +45,31 @@ void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAn const T stride = 1.f / anchors_info.spatial_scale(); const size_t feat_width = anchors_info.feat_width(); - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; - const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + const auto out_anchor_ptr = reinterpret_cast(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchors->ptr_to_element(Coordinates(0, anchor_offset))); - const size_t shift_idy = id.y() / num_anchors; - const T shiftx = (shift_idy % feat_width) * stride; - const T shifty = (shift_idy / feat_width) * stride; + const size_t shift_idy = id.y() / num_anchors; + const T shiftx = (shift_idy % feat_width) * stride; + const T shifty = (shift_idy / feat_width) * stride; - *out_anchor_ptr = *anchor_ptr + shiftx; - *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; - *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; - *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; - }, - all_anchors_it); + *out_anchor_ptr = *anchor_ptr + shiftx; + *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; + *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; + *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; + }, + all_anchors_it); } -void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window); +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp index cfb5a41d6e..7182d0b27d 100644 --- 
a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp @@ -26,9 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qu16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_qu16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp index 2b7d91b144..44418c0bb9 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp +++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/instancenorm/generic/neon/impl.h" @@ -40,7 +41,10 @@ void vector_float_sum_fp16(AccType &result, AccType &result_square, const InputT } template -InputType vector_float_norm_fp16(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) +InputType vector_float_norm_fp16(const InputType &inputs, + const AccType &vec_mean, + const AccType &vec_multip, + const AccType &vec_beta) { return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); } @@ -52,19 +56,24 @@ inline void vector_float_sum_fp16(float32x4_t &result, float32x4_t &result_squar vector_float_sum_fp16(result, result_square, wrapper::vcvt(wrapper::vgethigh(inputs))); } template <> -inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta) +inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, + const float32x4_t &vec_mean, + const float32x4_t &vec_multip, + const float32x4_t &vec_beta) { - const auto input_low = wrapper::vcvt(wrapper::vgetlow(inputs)); - const auto input_high = wrapper::vcvt(wrapper::vgethigh(inputs)); - const auto result_low = wrapper::vcvt(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta)); - const auto result_high = wrapper::vcvt(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta)); - float16x8_t result = wrapper::vcombine(result_low, result_high); + const auto input_low = wrapper::vcvt(wrapper::vgetlow(inputs)); + const auto input_high = wrapper::vcvt(wrapper::vgethigh(inputs)); + const auto result_low = wrapper::vcvt(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta)); + const auto result_high = + wrapper::vcvt(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta)); + float16x8_t result = wrapper::vcombine(result_low, result_high); return result; } template -void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +void instance_normalization_nchw_fp16( + const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) { /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -78,91 +87,105 @@ void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, flo const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast(0.f); - auto sum_squares_h_w = static_cast(0.f); - - execute_window_loop(win_plane, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); - } - - auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); - - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto value = static_cast(*(input_ptr + x)); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); - const auto vec_beta = wrapper::vdup_n(static_cast(beta), ExactTagType{}); - - execute_window_loop(win_plane, [&](const Coordinates &) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - const auto vec_val = wrapper::vloadq(input_ptr + x); - const auto normalized_vec = vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); - wrapper::vstore(output_ptr + x, normalized_vec); - } - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto val = static_cast(*(input_ptr + x)); - *(output_ptr + x) = static_cast((val - mean_h_w) * multip_h_w + beta); - } + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator 
output_plane_it(output, win_plane); + + auto sum_h_w = static_cast(0.f); + auto sum_squares_h_w = static_cast(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = static_cast(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = + vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast(*(input_ptr + x)); + *(output_ptr + x) = static_cast((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); }, - input_plane_it, output_plane_it); - }, - input_it); -} + input_it); } - -void neon_fp16_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window) +} // namespace + +void neon_fp16_instancenorm(ITensor *input, + ITensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision, + const Window &window) { - if(use_mixed_precision) + if (use_mixed_precision) { return instance_normalization_nchw_fp16(input, output, gamma, beta, epsilon, window); } diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp index 061dd9585c..e1ca05518d 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp +++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp @@ -26,7 +26,13 @@ namespace arm_compute { namespace cpu { -void neon_fp32_instancenorm(ITensor 
*input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window) +void neon_fp32_instancenorm(ITensor *input, + ITensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision, + const Window &window) { ARM_COMPUTE_UNUSED(use_mixed_precision); return instance_normalization_nchw(input, output, gamma, beta, epsilon, window); diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp index 483b6f568b..515079e1b5 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp +++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/kernels/instancenorm/generic/neon/impl.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -38,13 +39,15 @@ void vector_float_sum(AccType &result, AccType &result_square, const InputType & } template -InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) { return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); } template -void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -58,88 +61,96 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, f const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast(0.f); - auto sum_squares_h_w = static_cast(0.f); - - execute_window_loop(win_plane, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); - } - - auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); - - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto value = static_cast(*(input_ptr + x)); - sum_h_w += value; - 
sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); - const auto vec_beta = wrapper::vdup_n(static_cast(beta), ExactTagType{}); - - execute_window_loop(win_plane, [&](const Coordinates &) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto input_ptr = reinterpret_cast(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - //auto vec_val = wrapper::vdup_n(static_cast(0.0f), ExactTagType{}); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - const auto vec_val = wrapper::vloadq(input_ptr + x); - const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); - wrapper::vstore(output_ptr + x, normalized_vec); - } - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto val = static_cast(*(input_ptr + x)); - *(output_ptr + x) = static_cast((val - mean_h_w) * multip_h_w + beta); - } + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast(0.f); + auto sum_squares_h_w = static_cast(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = static_cast(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + 
[&](const Coordinates &) + { + auto input_ptr = reinterpret_cast(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + //auto vec_val = wrapper::vdup_n(static_cast(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast(*(input_ptr + x)); + *(output_ptr + x) = static_cast((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); }, - input_plane_it, output_plane_it); - }, - input_it); + input_it); } -template void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); +template void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h index 0ddfcdd5ba..e1cc7487f7 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/impl.h +++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h @@ -32,13 +32,15 @@ namespace arm_compute namespace cpu { template -void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); template void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs); template -InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta); +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h index 54f1d3213f..51b496c41d 100644 --- a/src/cpu/kernels/instancenorm/list.h +++ b/src/cpu/kernels/instancenorm/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_INSTANCENORM_KERNEL(func_name) \ - void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window) +#define DECLARE_INSTANCENORM_KERNEL(func_name) \ + void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \ + const Window &window) DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm); DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm); #undef DECLARE_INSTANCENORM_KERNEL diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index b503a8b734..32d9ca4eac 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -24,18 +24,17 @@ #include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" #include "arm_compute/core/Utils.h" -#include 
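
The instance-normalization hunks above accumulate a per-plane sum and sum of squares with vector lanes and then reduce with vpadd; the arithmetic they implement is shown below as a plain scalar sketch, assuming one contiguous H x W plane. The helper name and signature are illustrative only.

#include <cmath>
#include <cstddef>

// out[i] = (in[i] - mean) * gamma / sqrt(var + eps) + beta over one plane of n elements.
void instance_norm_plane(const float *in, float *out, size_t n,
                         float gamma, float beta, float eps)
{
    float sum = 0.f, sum_sq = 0.f;
    for (size_t i = 0; i < n; ++i)
    {
        sum    += in[i];
        sum_sq += in[i] * in[i];
    }
    const float mean   = sum / static_cast<float>(n);
    const float var    = sum_sq / static_cast<float>(n) - mean * mean; // E[x^2] - E[x]^2
    const float multip = gamma / std::sqrt(var + eps);
    for (size_t i = 0; i < n; ++i)
    {
        out[i] = (in[i] - mean) * multip + beta;
    }
}
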
"arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/AssemblyUtils.h" - #include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include "src/core/utils/AssemblyUtils.h" #include "depthwise_common.hpp" - #include namespace arm_compute @@ -54,9 +53,13 @@ constexpr unsigned int idx_channels = 0; constexpr unsigned int idx_batches = 3; template -void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, - std::unique_ptr &kernel, std::string &_name) +void create_arm_dwc(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, + std::unique_ptr &kernel, + std::string &_name) { unsigned int stride_cols{}; unsigned int stride_rows{}; @@ -79,13 +82,13 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); // Configure assembly pooling kernel auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args); - if(dwc_kernel_asm == nullptr) + if (dwc_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -96,11 +99,16 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI } template -void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, +void create_arm_dwc_quant(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, std::unique_ptr &kernel, - std::vector &multipliers, std::vector &right_shifts, std::vector &left_shifts, - std::string &_name) + std::vector &multipliers, + std::vector &right_shifts, + std::vector &left_shifts, + std::string &_name) { unsigned int stride_cols{}; unsigned int stride_rows{}; @@ -123,9 +131,9 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); const auto src_qinfo = src->quantization_info().uniform(); const auto weights_qinfo = 
weights->quantization_info(); @@ -135,64 +143,50 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT multipliers.resize(num_filters); std::vector dst_shifts(num_filters); - quantization::compute_quantized_multipliers_and_shifts(src, - weights, - dst, - multipliers.data(), - dst_shifts.data()); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data()); // Quantize activation bounds int32_t min_activation = std::numeric_limits::lowest(); int32_t max_activation = std::numeric_limits::max(); - if(info.act_info.enabled()) + if (info.act_info.enabled()) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); } // Set quantization parameters for assembly kernels arm_gemm::Requantize32 requant_args{}; - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { left_shifts.resize(num_filters); right_shifts.resize(num_filters); bool need_left_shift = false; // Select more optimized path if left shift is not needed - for(unsigned int i = 0; i < num_filters; ++i) + for (unsigned int i = 0; i < num_filters; ++i) { left_shifts[i] = std::max(-dst_shifts[i], static_cast(0)); right_shifts[i] = std::min(-dst_shifts[i], static_cast(0)); - if(dst_shifts[i] < 0 && !need_left_shift) + if (dst_shifts[i] < 0 && !need_left_shift) { need_left_shift = true; } } - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - (need_left_shift) ? left_shifts.data() : nullptr, - right_shifts.data(), - multipliers.data(), - static_cast(min_activation), - static_cast(max_activation)); + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, (need_left_shift) ? 
left_shifts.data() : nullptr, + right_shifts.data(), multipliers.data(), + static_cast(min_activation), static_cast(max_activation)); } else { - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - -dst_shifts[0], - multipliers[0], - static_cast(min_activation), - static_cast(max_activation)); + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, -dst_shifts[0], multipliers[0], + static_cast(min_activation), static_cast(max_activation)); } // Configure assembly pooling kernel with requantization - auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args, requant_args); - if(dwc_kernel_asm == nullptr) + auto dwc_kernel_asm = + arm_conv::depthwise::depthwise(args, requant_args); + if (dwc_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -203,18 +197,18 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT } // namespace CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() - : _kernel_asm(nullptr), - _multipliers(), - _left_shifts(), - _right_shifts(), - _name() + : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name() { } CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; -void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info) +void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info) { ARM_COMPUTE_UNUSED(cpu_info); ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -225,24 +219,30 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, _name = "CpuDepthwiseConv2dAssemblyWrapperKernel"; std::string asm_kernel_name(""); #if defined(__aarch64__) - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); } else { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); } break; case DataType::QASYMM8_SIGNED: - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, + _right_shifts, _left_shifts, asm_kernel_name); break; #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) case DataType::F16: - create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); + create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, + asm_kernel_name); break; #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) case DataType::F32: @@ -255,13 +255,17 @@ void 
CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, Window win = calculate_max_window(*dst, Steps()); ICpuKernel::configure(win); - if(_kernel_asm != nullptr) + if (_kernel_asm != nullptr) { _name += "/" + asm_kernel_name; } } -Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -269,10 +273,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); #endif // !defined(__aarch64__) ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, + "Only NHWC is supported by assembly kernels"); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); @@ -282,12 +288,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -297,7 +303,7 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, } } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -305,17 +311,15 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, } // Assembly kernels cannot work with padding greater than the kernel. 
- const auto &padding = info.pad_stride_info; - const auto &dilation = info.dilation; + const auto &padding = info.pad_stride_info; + const auto &dilation = info.dilation; const auto &wei_shape = weights->tensor_shape(); const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1); const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1); - ARM_COMPUTE_RETURN_ERROR_ON( - padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w || - padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h - ); + ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w || + padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h); return Status{}; } @@ -351,13 +355,12 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, - parameters_ptr, - dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); + _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row, + ld_dst_batch, working_space, info.thread_id, info.num_threads); } -void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) +void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) { _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); } diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h index f61cb1b09c..fadaefb999 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/kernels/CpuKernelSelectionTypes.h" @@ -35,8 +36,8 @@ namespace depthwise { // Forward declarations class IDepthwiseCommon; -} // depthwise -} // arm_conv +} // namespace depthwise +} // namespace arm_conv namespace arm_compute { @@ -66,7 +67,12 @@ public: * @param[in] info Depthwise convolution layer meta-data. * @param[in] cpu_info CPU information needed to select the most appropriate kernel. */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info); /** Indicates whether or not this function can be used to process the given parameters. * @@ -74,10 +80,14 @@ public: * * @return a status. 
*/ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Pack bias and weights in a storage space for the assembly kernel @@ -88,7 +98,8 @@ public: * @param[in] ld_weights_col Columns displacement for the weights tensor. * @param[in] ld_weights_row Rows displacement for the weights tensor. */ - void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); + void pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); /** Get the amount of storage space required for the rearranged weights and bias. * diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp index 10ff4183c0..a161c800fd 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -22,14 +22,16 @@ * SOFTWARE. */ #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" #include @@ -41,7 +43,10 @@ namespace kernels { using namespace arm_compute::misc::shape_calculator; -void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { ARM_COMPUTE_UNUSED(cpu_info); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -52,10 +57,10 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn #if defined(__aarch64__) const bool requantize = src->quantization_info() != dst->quantization_info(); - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: - if(requantize) + if (requantize) { create_arm_pooling_requant(src, dst, info, cpu_info); } @@ -65,7 +70,7 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn } break; case DataType::QASYMM8_SIGNED: - if(requantize) + if (requantize) { create_arm_pooling_requant(src, dst, info, cpu_info); } @@ -91,7 +96,8 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn INEKernel::configure(win); } -Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) +Status +CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -99,43 +105,52 @@ Status 
CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const IT ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); #endif /* __aarch64__ */ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), + "Only NHWC is supported by assembly kernels"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), "Only AVG and MAX pooling are supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(info), + "Pooling region that is entirely outside input tensor is unsupported by assembly kernels"); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); const auto src_qinfo = src->quantization_info().uniform(); const auto dst_qinfo = dst->quantization_info().uniform(); - if(src_qinfo != dst_qinfo) + if (src_qinfo != dst_qinfo) { const float multiplier = src_qinfo.scale / dst_qinfo.scale; int32_t dst_multiplier{}; int32_t dst_shift{}; - ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); + ARM_COMPUTE_RETURN_ERROR_ON( + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); } else { - if(src->data_type() == DataType::QASYMM8) + if (src->data_type() == DataType::QASYMM8) { const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); } } } else { - if(src->data_type() == DataType::QASYMM8) + if (src->data_type() == DataType::QASYMM8) { // If dst is not configured, the quantization info are the same const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); } } return Status{}; @@ -154,9 +169,10 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window & ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = (workspace == nullptr) ? 
nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = + (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); const auto src_shape = src->info()->tensor_shape(); const auto dst_shape = dst->info()->tensor_shape(); @@ -170,8 +186,7 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window & const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, - out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, working_space, info.thread_id, info.num_threads); } @@ -186,9 +201,14 @@ bool CpuPool2dAssemblyWrapperKernel::is_configured() const } template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; arm_conv::pooling::PoolingWindow window{}; window.cols = static_cast(info.pool_size.x()); @@ -197,7 +217,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, arm_conv::pooling::PoolingStride stride{}; std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; constexpr unsigned int idx_width = 1; constexpr unsigned int idx_height = 2; @@ -211,11 +232,12 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, const unsigned int dst_rows = dst->dimension(idx_height); const unsigned int dst_cols = dst->dimension(idx_width); - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); // Configure assembly pooling kernel auto pooling_kernel_asm = arm_conv::pooling::pooling(args); - if(pooling_kernel_asm == nullptr) + if (pooling_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -225,9 +247,14 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, } template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo 
*dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; arm_conv::pooling::PoolingWindow window{}; window.cols = static_cast(info.pool_size.x()); @@ -236,7 +263,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf arm_conv::pooling::PoolingStride stride{}; std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; constexpr unsigned int idx_width = 1; constexpr unsigned int idx_height = 2; @@ -250,7 +278,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf const unsigned int dst_rows = dst->dimension(idx_height); const unsigned int dst_cols = dst->dimension(idx_width); - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); const auto src_qinfo = src->quantization_info().uniform(); const auto dst_qinfo = dst->quantization_info().uniform(); @@ -260,15 +289,15 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf int32_t dst_shift{}; quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); - const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, - dst_qinfo.offset, + const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset, dst_shift, // left shift 0, // right shift dst_multiplier); // Configure assembly pooling kernel with requantization - auto pooling_kernel_asm = arm_conv::pooling::pooling(args, requant_args); - if(pooling_kernel_asm == nullptr) + auto pooling_kernel_asm = + arm_conv::pooling::pooling(args, requant_args); + if (pooling_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h index 8713d5c54d..b4ff1e6f2d 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h @@ -25,8 +25,9 @@ #define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H #include "arm_compute/core/Types.h" -#include "src/core/NEON/kernels/assembly/pooling.hpp" + #include "src/core/common/Macros.h" +#include "src/core/NEON/kernels/assembly/pooling.hpp" #include "src/cpu/ICpuKernel.h" #include "src/cpu/kernels/CpuKernelSelectionTypes.h" @@ -101,7 +102,8 @@ private: * @param[in] info Pooling layer meta-data. 
*/ template - void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + void + create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); /** Helper function to create the assembly kernel with requantization support * @@ -110,9 +112,12 @@ private: * @param[in] info Pooling layer meta-data. */ template - void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + void create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info); - std::unique_ptr _kernel_asm{ nullptr }; + std::unique_ptr _kernel_asm{nullptr}; /** Return minimum workload size of the relevant kernel * diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp index 661c3d7f46..6c6527de06 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp @@ -32,13 +32,15 @@ namespace arm_compute { namespace cpu { -void neon_fp16_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +void neon_fp16_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) { ARM_COMPUTE_UNUSED(unused_axis); return l2_normalize_x(in, sum, out, epsilon, window); } -void neon_fp16_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void neon_fp16_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { return l2_normalize_yz(in, sum, out, epsilon, window, axis); } diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp index be32bdc4fa..520877068c 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp @@ -22,21 +22,23 @@ * SOFTWARE. 
*/ -#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" + namespace arm_compute { namespace cpu { -void neon_fp32_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +void neon_fp32_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) { ARM_COMPUTE_UNUSED(unused_axis); return l2_normalize_x(in, sum, out, epsilon, window); } -void neon_fp32_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void neon_fp32_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { return l2_normalize_yz(in, sum, out, epsilon, window, axis); } diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h index a06cdd33d3..6bd19299b7 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h +++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -51,33 +52,36 @@ void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float e Iterator sum_it(sum, win_collapsed); Iterator output_it(out, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_it.ptr()); - const auto out_ptr = reinterpret_cast(output_it.ptr()); - - const T sum_value = *reinterpret_cast(sum_it.ptr()); - const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_value, static_cast(epsilon))); - const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); + const auto in_ptr = reinterpret_cast(input_it.ptr()); + const auto out_ptr = reinterpret_cast(output_it.ptr()); + + const T sum_value = *reinterpret_cast(sum_it.ptr()); + const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_value, static_cast(epsilon))); + const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); } template -void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { using ExactTagType = typename wrapper::traits::neon_vector::tag_type; @@ -97,28 +101,30 @@ void l2_normalize_yz(const 
ITensor *in, const ITensor *sum, ITensor *out, float const auto vec_eps = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_it.ptr()); - const auto sum_ptr = reinterpret_cast(sum_it.ptr()); - const auto out_ptr = reinterpret_cast(output_it.ptr()); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast(epsilon))); - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); + const auto in_ptr = reinterpret_cast(input_it.ptr()); + const auto sum_ptr = reinterpret_cast(sum_it.ptr()); + const auto out_ptr = reinterpret_cast(output_it.ptr()); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast(epsilon))); + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h index 2bad7f54f5..e2a879d06e 100644 --- a/src/cpu/kernels/l2normlayer/list.h +++ b/src/cpu/kernels/l2normlayer/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_L2NORMLAYER_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +#define DECLARE_L2NORMLAYER_KERNEL(func_name) \ + void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \ + size_t axis) DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x); DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz); diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp index 8ab647bfee..5516f5b33d 100644 --- a/src/cpu/kernels/lut/generic/neon/u8.cpp +++ b/src/cpu/kernels/lut/generic/neon/u8.cpp @@ -32,376 +32,374 @@ namespace cpu #ifdef __aarch64__ void lut_u8_neon( - const uint8_t *table, - size_t num_strings, - size_t string_length, - const uint8_t *const *input, - uint8_t *const *output) + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) { - __asm__ __volatile__( - "ldr q16, [%x[table], #0x0]\n" - "ldr q17, [%x[table], #0x10]\n" - "mov x23, #0x0\n" - "ldr q18, [%x[table], #0x20]\n" - "ldr q19, [%x[table], #0x30]\n" - "ldr q20, [%x[table], #0x40]\n" - "ldr q21, [%x[table], #0x50]\n" - "ldr q22, [%x[table], #0x60]\n" - "ldr q23, [%x[table], #0x70]\n" - "ldr q24, [%x[table], #0x80]\n" - "ldr q25, [%x[table], #0x90]\n" - "ldr q26, [%x[table], #0xa0]\n" - "ldr q27, [%x[table], #0xb0]\n" - "ldr q28, [%x[table], #0xc0]\n" - "ldr q29, 
[%x[table], #0xd0]\n" - "ldr q30, [%x[table], #0xe0]\n" - "ldr q31, [%x[table], #0xf0]\n" - "1:" // string loop - "ldr x22, [%x[input], x23, LSL #0x3]\n" - "ldr x21, [%x[output], x23, LSL #0x3]\n" - "movi v11.16b, #0x40\n" - "movi v10.16b, #0x80\n" - "movi v9.16b, #0xc0\n" - "mov x20, %x[string_length]\n" - "2:" // 4 rounds: width loop - "cmp x20, #0x30\n" - "bge 27f\n" - "tbz x20, #5, 10f\n" - "ld1 { v8.16b }, [x22], #0x10\n" - "ld1 { v13.16b }, [x22], #0x10\n" - "tbz x20, #3, 6f\n" - "ldr d12, [x22], #0x8\n" - "tbz x20, #2, 4f\n" - "ld1 { v12.s }[2], [x22], #0x4\n" - "tbz x20, #1, 3f\n" - "ld1 { v12.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[14], [x22]\n" - "b 26f\n" - "3:" // 4 rounds: Partial load: partial_1_44 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[12], [x22]\n" - "b 26f\n" - "4:" // 4 rounds: Partial load: partial_2_40 - "tbz x20, #1, 5f\n" - "ld1 { v12.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[10], [x22]\n" - "b 26f\n" - "5:" // 4 rounds: Partial load: partial_1_40 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[8], [x22]\n" - "b 26f\n" - "6:" // 4 rounds: Partial load: partial_4_32 - "tbz x20, #2, 8f\n" - "ldr s12, [x22], #0x4\n" - "tbz x20, #1, 7f\n" - "ld1 { v12.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[6], [x22]\n" - "b 26f\n" - "7:" // 4 rounds: Partial load: partial_1_36 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[4], [x22]\n" - "b 26f\n" - "8:" // 4 rounds: Partial load: partial_2_32 - "tbz x20, #1, 9f\n" - "ldr h12, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[2], [x22]\n" - "b 26f\n" - "9:" // 4 rounds: Partial load: partial_1_32 - "tbz x20, #0, 26f\n" - "ldr b12, [x22, #0x0]\n" - "b 26f\n" - "10:" // 4 rounds: Partial load: partial_16_0 - "tbz x20, #4, 18f\n" - "ld1 { v8.16b }, [x22], #0x10\n" - "tbz x20, #3, 14f\n" - "ldr d13, [x22], #0x8\n" - "tbz x20, #2, 12f\n" - "ld1 { v13.s }[2], [x22], #0x4\n" - "tbz x20, #1, 11f\n" - "ld1 { v13.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[14], [x22]\n" - "b 26f\n" - "11:" // 4 rounds: Partial load: partial_1_28 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[12], [x22]\n" - "b 26f\n" - "12:" // 4 rounds: Partial load: partial_2_24 - "tbz x20, #1, 13f\n" - "ld1 { v13.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[10], [x22]\n" - "b 26f\n" - "13:" // 4 rounds: Partial load: partial_1_24 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[8], [x22]\n" - "b 26f\n" - "14:" // 4 rounds: Partial load: partial_4_16 - "tbz x20, #2, 16f\n" - "ldr s13, [x22], #0x4\n" - "tbz x20, #1, 15f\n" - "ld1 { v13.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[6], [x22]\n" - "b 26f\n" - "15:" // 4 rounds: Partial load: partial_1_20 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[4], [x22]\n" - "b 26f\n" - "16:" // 4 rounds: Partial load: partial_2_16 - "tbz x20, #1, 17f\n" - "ldr h13, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[2], [x22]\n" - "b 26f\n" - "17:" // 4 rounds: Partial load: partial_1_16 - "tbz x20, #0, 26f\n" - "ldr b13, [x22, #0x0]\n" - "b 26f\n" - "18:" // 4 rounds: Partial load: partial_8_0 - "tbz x20, #3, 22f\n" - "ldr d8, [x22], #0x8\n" - "tbz x20, #2, 20f\n" - "ld1 { v8.s }[2], [x22], #0x4\n" - "tbz x20, #1, 19f\n" - "ld1 { v8.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[14], [x22]\n" - "b 26f\n" - "19:" // 4 rounds: Partial load: partial_1_12 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[12], [x22]\n" - "b 26f\n" - "20:" // 4 rounds: Partial load: partial_2_8 - "tbz x20, #1, 21f\n" - "ld1 { v8.h }[4], [x22], #0x2\n" - "tbz x20, #0, 
26f\n" - "ld1 { v8.b }[10], [x22]\n" - "b 26f\n" - "21:" // 4 rounds: Partial load: partial_1_8 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[8], [x22]\n" - "b 26f\n" - "22:" // 4 rounds: Partial load: partial_4_0 - "tbz x20, #2, 24f\n" - "ldr s8, [x22], #0x4\n" - "tbz x20, #1, 23f\n" - "ld1 { v8.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[6], [x22]\n" - "b 26f\n" - "23:" // 4 rounds: Partial load: partial_1_4 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[4], [x22]\n" - "b 26f\n" - "24:" // 4 rounds: Partial load: partial_2_0 - "tbz x20, #1, 25f\n" - "ldr h8, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[2], [x22]\n" - "b 26f\n" - "25:" // 4 rounds: Partial load: partial_1_0 - "ldr b8, [x22, #0x0]\n" - "26:" // 4 rounds: Partial load: Done - "b 28f\n" - "27:" // 4 rounds: Full load - "ldr q8, [x22, #0x0]\n" - "ldr q13, [x22, #0x10]\n" - "ldr q12, [x22, #0x20]\n" - "add x22, x22, #0x30\n" - "28:" // 4 rounds: Load done - "sub v0.16b, v8.16b, v11.16b\n" - "sub v7.16b, v8.16b, v10.16b\n" - "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n" - "sub v6.16b, v8.16b, v9.16b\n" - "sub v5.16b, v13.16b, v11.16b\n" - "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n" - "sub v4.16b, v13.16b, v10.16b\n" - "sub v3.16b, v13.16b, v9.16b\n" - "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n" - "sub v2.16b, v12.16b, v11.16b\n" - "sub v1.16b, v12.16b, v10.16b\n" - "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n" - "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n" - "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n" - "orr v8.16b, v8.16b, v0.16b\n" - "sub v0.16b, v12.16b, v9.16b\n" - "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n" - "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n" - "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n" - "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n" - "orr v7.16b, v7.16b, v6.16b\n" - "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n" - "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n" - "orr v13.16b, v13.16b, v5.16b\n" - "orr v4.16b, v4.16b, v3.16b\n" - "orr v12.16b, v12.16b, v2.16b\n" - "cmp x20, #0x30\n" - "orr v1.16b, v1.16b, v0.16b\n" - "orr v8.16b, v8.16b, v7.16b\n" - "orr v13.16b, v13.16b, v4.16b\n" - "orr v12.16b, v12.16b, v1.16b\n" - "bge 53f\n" - "tbz x20, #5, 36f\n" - "st1 { v8.16b }, [x21], #0x10\n" - "st1 { v13.16b }, [x21], #0x10\n" - "tbz x20, #3, 32f\n" - "str d12, [x21], #0x8\n" - "tbz x20, #2, 30f\n" - "st1 { v12.s }[2], [x21], #0x4\n" - "tbz x20, #1, 29f\n" - "st1 { v12.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[14], [x21]\n" - "b 52f\n" - "29:" // 4 rounds: Partial writeback: partial_1_44 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[12], [x21]\n" - "b 52f\n" - "30:" // 4 rounds: Partial writeback: partial_2_40 - "tbz x20, #1, 31f\n" - "st1 { v12.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[10], [x21]\n" - "b 52f\n" - "31:" // 4 rounds: Partial writeback: partial_1_40 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[8], [x21]\n" - "b 52f\n" - "32:" // 4 rounds: Partial writeback: partial_4_32 - "tbz x20, #2, 34f\n" - "str s12, [x21], #0x4\n" - "tbz x20, #1, 33f\n" - "st1 { v12.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[6], [x21]\n" - "b 52f\n" - "33:" // 4 rounds: Partial writeback: partial_1_36 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[4], [x21]\n" - "b 52f\n" - "34:" // 4 rounds: Partial writeback: partial_2_32 - "tbz x20, #1, 35f\n" - "str h12, 
[x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[2], [x21]\n" - "b 52f\n" - "35:" // 4 rounds: Partial writeback: partial_1_32 - "tbz x20, #0, 52f\n" - "str b12, [x21, #0x0]\n" - "b 52f\n" - "36:" // 4 rounds: Partial writeback: partial_16_0 - "tbz x20, #4, 44f\n" - "st1 { v8.16b }, [x21], #0x10\n" - "tbz x20, #3, 40f\n" - "str d13, [x21], #0x8\n" - "tbz x20, #2, 38f\n" - "st1 { v13.s }[2], [x21], #0x4\n" - "tbz x20, #1, 37f\n" - "st1 { v13.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[14], [x21]\n" - "b 52f\n" - "37:" // 4 rounds: Partial writeback: partial_1_28 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[12], [x21]\n" - "b 52f\n" - "38:" // 4 rounds: Partial writeback: partial_2_24 - "tbz x20, #1, 39f\n" - "st1 { v13.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[10], [x21]\n" - "b 52f\n" - "39:" // 4 rounds: Partial writeback: partial_1_24 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[8], [x21]\n" - "b 52f\n" - "40:" // 4 rounds: Partial writeback: partial_4_16 - "tbz x20, #2, 42f\n" - "str s13, [x21], #0x4\n" - "tbz x20, #1, 41f\n" - "st1 { v13.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[6], [x21]\n" - "b 52f\n" - "41:" // 4 rounds: Partial writeback: partial_1_20 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[4], [x21]\n" - "b 52f\n" - "42:" // 4 rounds: Partial writeback: partial_2_16 - "tbz x20, #1, 43f\n" - "str h13, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[2], [x21]\n" - "b 52f\n" - "43:" // 4 rounds: Partial writeback: partial_1_16 - "tbz x20, #0, 52f\n" - "str b13, [x21, #0x0]\n" - "b 52f\n" - "44:" // 4 rounds: Partial writeback: partial_8_0 - "tbz x20, #3, 48f\n" - "str d8, [x21], #0x8\n" - "tbz x20, #2, 46f\n" - "st1 { v8.s }[2], [x21], #0x4\n" - "tbz x20, #1, 45f\n" - "st1 { v8.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[14], [x21]\n" - "b 52f\n" - "45:" // 4 rounds: Partial writeback: partial_1_12 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[12], [x21]\n" - "b 52f\n" - "46:" // 4 rounds: Partial writeback: partial_2_8 - "tbz x20, #1, 47f\n" - "st1 { v8.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[10], [x21]\n" - "b 52f\n" - "47:" // 4 rounds: Partial writeback: partial_1_8 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[8], [x21]\n" - "b 52f\n" - "48:" // 4 rounds: Partial writeback: partial_4_0 - "tbz x20, #2, 50f\n" - "str s8, [x21], #0x4\n" - "tbz x20, #1, 49f\n" - "st1 { v8.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[6], [x21]\n" - "b 52f\n" - "49:" // 4 rounds: Partial writeback: partial_1_4 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[4], [x21]\n" - "b 52f\n" - "50:" // 4 rounds: Partial writeback: partial_2_0 - "tbz x20, #1, 51f\n" - "str h8, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[2], [x21]\n" - "b 52f\n" - "51:" // 4 rounds: Partial writeback: partial_1_0 - "str b8, [x21, #0x0]\n" - "52:" // 4 rounds: Partial writeback: Done - "b 54f\n" - "53:" // 4 rounds: Full writeback - "str q8, [x21, #0x0]\n" - "str q13, [x21, #0x10]\n" - "str q12, [x21, #0x20]\n" - "add x21, x21, #0x30\n" - "54:" // 4 rounds: Writeback done - "subs x20, x20, #0x30\n" - "bgt 2b\n" - "add x23, x23, #0x1\n" - "cmp x23, %x[num_strings]\n" - "bne 1b\n" - : - : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length), [table] "r"(table) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", 
"v31", "x20", "x21", "x22", "x23"); + __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n" + "ldr q17, [%x[table], #0x10]\n" + "mov x23, #0x0\n" + "ldr q18, [%x[table], #0x20]\n" + "ldr q19, [%x[table], #0x30]\n" + "ldr q20, [%x[table], #0x40]\n" + "ldr q21, [%x[table], #0x50]\n" + "ldr q22, [%x[table], #0x60]\n" + "ldr q23, [%x[table], #0x70]\n" + "ldr q24, [%x[table], #0x80]\n" + "ldr q25, [%x[table], #0x90]\n" + "ldr q26, [%x[table], #0xa0]\n" + "ldr q27, [%x[table], #0xb0]\n" + "ldr q28, [%x[table], #0xc0]\n" + "ldr q29, [%x[table], #0xd0]\n" + "ldr q30, [%x[table], #0xe0]\n" + "ldr q31, [%x[table], #0xf0]\n" + "1:" // string loop + "ldr x22, [%x[input], x23, LSL #0x3]\n" + "ldr x21, [%x[output], x23, LSL #0x3]\n" + "movi v11.16b, #0x40\n" + "movi v10.16b, #0x80\n" + "movi v9.16b, #0xc0\n" + "mov x20, %x[string_length]\n" + "2:" // 4 rounds: width loop + "cmp x20, #0x30\n" + "bge 27f\n" + "tbz x20, #5, 10f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "ld1 { v13.16b }, [x22], #0x10\n" + "tbz x20, #3, 6f\n" + "ldr d12, [x22], #0x8\n" + "tbz x20, #2, 4f\n" + "ld1 { v12.s }[2], [x22], #0x4\n" + "tbz x20, #1, 3f\n" + "ld1 { v12.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[14], [x22]\n" + "b 26f\n" + "3:" // 4 rounds: Partial load: partial_1_44 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[12], [x22]\n" + "b 26f\n" + "4:" // 4 rounds: Partial load: partial_2_40 + "tbz x20, #1, 5f\n" + "ld1 { v12.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[10], [x22]\n" + "b 26f\n" + "5:" // 4 rounds: Partial load: partial_1_40 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[8], [x22]\n" + "b 26f\n" + "6:" // 4 rounds: Partial load: partial_4_32 + "tbz x20, #2, 8f\n" + "ldr s12, [x22], #0x4\n" + "tbz x20, #1, 7f\n" + "ld1 { v12.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[6], [x22]\n" + "b 26f\n" + "7:" // 4 rounds: Partial load: partial_1_36 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[4], [x22]\n" + "b 26f\n" + "8:" // 4 rounds: Partial load: partial_2_32 + "tbz x20, #1, 9f\n" + "ldr h12, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[2], [x22]\n" + "b 26f\n" + "9:" // 4 rounds: Partial load: partial_1_32 + "tbz x20, #0, 26f\n" + "ldr b12, [x22, #0x0]\n" + "b 26f\n" + "10:" // 4 rounds: Partial load: partial_16_0 + "tbz x20, #4, 18f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "tbz x20, #3, 14f\n" + "ldr d13, [x22], #0x8\n" + "tbz x20, #2, 12f\n" + "ld1 { v13.s }[2], [x22], #0x4\n" + "tbz x20, #1, 11f\n" + "ld1 { v13.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[14], [x22]\n" + "b 26f\n" + "11:" // 4 rounds: Partial load: partial_1_28 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[12], [x22]\n" + "b 26f\n" + "12:" // 4 rounds: Partial load: partial_2_24 + "tbz x20, #1, 13f\n" + "ld1 { v13.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[10], [x22]\n" + "b 26f\n" + "13:" // 4 rounds: Partial load: partial_1_24 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[8], [x22]\n" + "b 26f\n" + "14:" // 4 rounds: Partial load: partial_4_16 + "tbz x20, #2, 16f\n" + "ldr s13, [x22], #0x4\n" + "tbz x20, #1, 15f\n" + "ld1 { v13.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[6], [x22]\n" + "b 26f\n" + "15:" // 4 rounds: Partial load: partial_1_20 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[4], [x22]\n" + "b 26f\n" + "16:" // 4 rounds: Partial load: partial_2_16 + "tbz x20, #1, 17f\n" + "ldr h13, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[2], [x22]\n" + "b 26f\n" + "17:" // 4 rounds: Partial load: partial_1_16 + "tbz x20, #0, 26f\n" + "ldr b13, [x22, 
#0x0]\n" + "b 26f\n" + "18:" // 4 rounds: Partial load: partial_8_0 + "tbz x20, #3, 22f\n" + "ldr d8, [x22], #0x8\n" + "tbz x20, #2, 20f\n" + "ld1 { v8.s }[2], [x22], #0x4\n" + "tbz x20, #1, 19f\n" + "ld1 { v8.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[14], [x22]\n" + "b 26f\n" + "19:" // 4 rounds: Partial load: partial_1_12 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[12], [x22]\n" + "b 26f\n" + "20:" // 4 rounds: Partial load: partial_2_8 + "tbz x20, #1, 21f\n" + "ld1 { v8.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[10], [x22]\n" + "b 26f\n" + "21:" // 4 rounds: Partial load: partial_1_8 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[8], [x22]\n" + "b 26f\n" + "22:" // 4 rounds: Partial load: partial_4_0 + "tbz x20, #2, 24f\n" + "ldr s8, [x22], #0x4\n" + "tbz x20, #1, 23f\n" + "ld1 { v8.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[6], [x22]\n" + "b 26f\n" + "23:" // 4 rounds: Partial load: partial_1_4 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[4], [x22]\n" + "b 26f\n" + "24:" // 4 rounds: Partial load: partial_2_0 + "tbz x20, #1, 25f\n" + "ldr h8, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[2], [x22]\n" + "b 26f\n" + "25:" // 4 rounds: Partial load: partial_1_0 + "ldr b8, [x22, #0x0]\n" + "26:" // 4 rounds: Partial load: Done + "b 28f\n" + "27:" // 4 rounds: Full load + "ldr q8, [x22, #0x0]\n" + "ldr q13, [x22, #0x10]\n" + "ldr q12, [x22, #0x20]\n" + "add x22, x22, #0x30\n" + "28:" // 4 rounds: Load done + "sub v0.16b, v8.16b, v11.16b\n" + "sub v7.16b, v8.16b, v10.16b\n" + "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n" + "sub v6.16b, v8.16b, v9.16b\n" + "sub v5.16b, v13.16b, v11.16b\n" + "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n" + "sub v4.16b, v13.16b, v10.16b\n" + "sub v3.16b, v13.16b, v9.16b\n" + "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n" + "sub v2.16b, v12.16b, v11.16b\n" + "sub v1.16b, v12.16b, v10.16b\n" + "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n" + "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n" + "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n" + "orr v8.16b, v8.16b, v0.16b\n" + "sub v0.16b, v12.16b, v9.16b\n" + "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n" + "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n" + "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n" + "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n" + "orr v7.16b, v7.16b, v6.16b\n" + "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n" + "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n" + "orr v13.16b, v13.16b, v5.16b\n" + "orr v4.16b, v4.16b, v3.16b\n" + "orr v12.16b, v12.16b, v2.16b\n" + "cmp x20, #0x30\n" + "orr v1.16b, v1.16b, v0.16b\n" + "orr v8.16b, v8.16b, v7.16b\n" + "orr v13.16b, v13.16b, v4.16b\n" + "orr v12.16b, v12.16b, v1.16b\n" + "bge 53f\n" + "tbz x20, #5, 36f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "st1 { v13.16b }, [x21], #0x10\n" + "tbz x20, #3, 32f\n" + "str d12, [x21], #0x8\n" + "tbz x20, #2, 30f\n" + "st1 { v12.s }[2], [x21], #0x4\n" + "tbz x20, #1, 29f\n" + "st1 { v12.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[14], [x21]\n" + "b 52f\n" + "29:" // 4 rounds: Partial writeback: partial_1_44 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[12], [x21]\n" + "b 52f\n" + "30:" // 4 rounds: Partial writeback: partial_2_40 + "tbz x20, #1, 31f\n" + "st1 { v12.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[10], [x21]\n" + "b 52f\n" + "31:" // 4 rounds: 
Partial writeback: partial_1_40 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[8], [x21]\n" + "b 52f\n" + "32:" // 4 rounds: Partial writeback: partial_4_32 + "tbz x20, #2, 34f\n" + "str s12, [x21], #0x4\n" + "tbz x20, #1, 33f\n" + "st1 { v12.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[6], [x21]\n" + "b 52f\n" + "33:" // 4 rounds: Partial writeback: partial_1_36 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[4], [x21]\n" + "b 52f\n" + "34:" // 4 rounds: Partial writeback: partial_2_32 + "tbz x20, #1, 35f\n" + "str h12, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[2], [x21]\n" + "b 52f\n" + "35:" // 4 rounds: Partial writeback: partial_1_32 + "tbz x20, #0, 52f\n" + "str b12, [x21, #0x0]\n" + "b 52f\n" + "36:" // 4 rounds: Partial writeback: partial_16_0 + "tbz x20, #4, 44f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "tbz x20, #3, 40f\n" + "str d13, [x21], #0x8\n" + "tbz x20, #2, 38f\n" + "st1 { v13.s }[2], [x21], #0x4\n" + "tbz x20, #1, 37f\n" + "st1 { v13.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[14], [x21]\n" + "b 52f\n" + "37:" // 4 rounds: Partial writeback: partial_1_28 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[12], [x21]\n" + "b 52f\n" + "38:" // 4 rounds: Partial writeback: partial_2_24 + "tbz x20, #1, 39f\n" + "st1 { v13.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[10], [x21]\n" + "b 52f\n" + "39:" // 4 rounds: Partial writeback: partial_1_24 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[8], [x21]\n" + "b 52f\n" + "40:" // 4 rounds: Partial writeback: partial_4_16 + "tbz x20, #2, 42f\n" + "str s13, [x21], #0x4\n" + "tbz x20, #1, 41f\n" + "st1 { v13.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[6], [x21]\n" + "b 52f\n" + "41:" // 4 rounds: Partial writeback: partial_1_20 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[4], [x21]\n" + "b 52f\n" + "42:" // 4 rounds: Partial writeback: partial_2_16 + "tbz x20, #1, 43f\n" + "str h13, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[2], [x21]\n" + "b 52f\n" + "43:" // 4 rounds: Partial writeback: partial_1_16 + "tbz x20, #0, 52f\n" + "str b13, [x21, #0x0]\n" + "b 52f\n" + "44:" // 4 rounds: Partial writeback: partial_8_0 + "tbz x20, #3, 48f\n" + "str d8, [x21], #0x8\n" + "tbz x20, #2, 46f\n" + "st1 { v8.s }[2], [x21], #0x4\n" + "tbz x20, #1, 45f\n" + "st1 { v8.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[14], [x21]\n" + "b 52f\n" + "45:" // 4 rounds: Partial writeback: partial_1_12 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[12], [x21]\n" + "b 52f\n" + "46:" // 4 rounds: Partial writeback: partial_2_8 + "tbz x20, #1, 47f\n" + "st1 { v8.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[10], [x21]\n" + "b 52f\n" + "47:" // 4 rounds: Partial writeback: partial_1_8 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[8], [x21]\n" + "b 52f\n" + "48:" // 4 rounds: Partial writeback: partial_4_0 + "tbz x20, #2, 50f\n" + "str s8, [x21], #0x4\n" + "tbz x20, #1, 49f\n" + "st1 { v8.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[6], [x21]\n" + "b 52f\n" + "49:" // 4 rounds: Partial writeback: partial_1_4 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[4], [x21]\n" + "b 52f\n" + "50:" // 4 rounds: Partial writeback: partial_2_0 + "tbz x20, #1, 51f\n" + "str h8, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[2], [x21]\n" + "b 52f\n" + "51:" // 4 rounds: Partial writeback: partial_1_0 + "str b8, [x21, #0x0]\n" + "52:" // 4 rounds: Partial writeback: Done + "b 54f\n" + "53:" // 4 rounds: Full writeback + "str q8, [x21, #0x0]\n" + "str q13, [x21, #0x10]\n" + "str q12, [x21, 
#0x20]\n" + "add x21, x21, #0x30\n" + "54:" // 4 rounds: Writeback done + "subs x20, x20, #0x30\n" + "bgt 2b\n" + "add x23, x23, #0x1\n" + "cmp x23, %x[num_strings]\n" + "bne 1b\n" + : + : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), + [string_length] "r"(string_length), [table] "r"(table) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"); } #endif // __aarch64__ diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp index b80d75326e..ee8572703e 100644 --- a/src/cpu/kernels/lut/generic/sve2/u8.cpp +++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp @@ -32,11 +32,7 @@ namespace arm_compute namespace cpu { void lut_u8_sve2( - const uint8_t *table, - size_t num_strings, - size_t string_length, - const uint8_t *const *input, - uint8_t *const *output) + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) { __asm__ __volatile__( "ptrue p0.b\n" @@ -636,7 +632,9 @@ void lut_u8_sve2( "bne 2b\n" : [table] "+&r"(table) : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", + "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", + "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); } } // namespace cpu diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h index 7a2afc6927..da90346267 100644 --- a/src/cpu/kernels/lut/list.h +++ b/src/cpu/kernels/lut/list.h @@ -34,13 +34,9 @@ namespace cpu { #ifdef __aarch64__ -#define DECLARE_LUT_KERNEL(func_name) \ - void func_name( \ - const uint8_t *table, \ - size_t num_strings, \ - size_t string_length, \ - const uint8_t *const *input, \ - uint8_t *const *output) +#define DECLARE_LUT_KERNEL(func_name) \ + void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \ + uint8_t *const *output) DECLARE_LUT_KERNEL(lut_u8_neon); DECLARE_LUT_KERNEL(lut_u8_sve2); diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h index 5fe19c4707..73a5b86a2f 100644 --- a/src/cpu/kernels/maxunpool/generic/neon/impl.h +++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -37,13 +38,15 @@ void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output Iterator indices_itr(indices, window); auto out_ptr = reinterpret_cast(output->buffer()); const int out_stride_w = static_cast(output->info()->strides_in_bytes()[3]); - execute_window_loop(window, [&](const Coordinates & id) - { - auto vindices = reinterpret_cast(indices_itr.ptr()); - auto vinput = reinterpret_cast(input_itr.ptr()); - 
out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput; - }, - input_itr, indices_itr); + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto vindices = reinterpret_cast(indices_itr.ptr()); + auto vinput = reinterpret_cast(input_itr.ptr()); + out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput; + }, + input_itr, indices_itr); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp index 96e4030268..6470f391e2 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp @@ -23,9 +23,9 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" namespace arm_compute { @@ -45,64 +45,66 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, fl Iterator input_itr(input, win); Iterator output_itr(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); - float16x8_t sum_vec = vdupq_n_f16(static_cast(0.0f)); - float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); + float16x8_t sum_vec = vdupq_n_f16(static_cast(0.0f)); + float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - sum_vec = vaddq_f16(sum_vec, data); - float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); - float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + sum_vec = vaddq_f16(sum_vec, data); + float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); + float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); + } - float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec)); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); + float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec)); + sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); + sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec); - sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res); + float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec); + sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res); - float16_t sum = vget_lane_f16(sum_carry_res, 0); - float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0); + float16_t sum = vget_lane_f16(sum_carry_res, 0); + float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - float16_t data = *(in_ptr + x); - sum += data; - float fdata = static_cast(data); - 
sum_sq += fdata * fdata; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + float16_t data = *(in_ptr + x); + sum += data; + float fdata = static_cast(data); + sum_sq += fdata * fdata; + } - float16_t mean = sum / input->info()->dimension(0); - float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - float16_t stddev_inv = static_cast(1.f / sqrt(var + epsilon)); + float16_t mean = sum / input->info()->dimension(0); + float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + float16_t stddev_inv = static_cast(1.f / sqrt(var + epsilon)); - float16x8_t mean_vec = vdupq_n_f16(mean); - float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); + float16x8_t mean_vec = vdupq_n_f16(mean); + float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); - // Store results - vst1q_f16(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); + // Store results + vst1q_f16(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); } void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window) diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp index 0522d6e277..11f6294a35 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -45,60 +46,62 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, c Iterator input_itr(input, win); Iterator output_itr(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); - auto sum_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - auto sum_sq_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto sum_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto sum_sq_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - sum_vec = wrapper::vadd(sum_vec, data); - sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + sum_vec = wrapper::vadd(sum_vec, data); + sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); + } - auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); - auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), 
wrapper::vgetlow(sum_sq_vec)); - for(int i = 0; i < size / 4; ++i) - { - sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); - sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); - } + auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); + auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); + for (int i = 0; i < size / 4; ++i) + { + sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); + sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); + } - auto sum = wrapper::vgetlane(sum_carry_res, 0); - auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); + auto sum = wrapper::vgetlane(sum_carry_res, 0); + auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - ScalarType data = *(in_ptr + x); - sum += data; - sum_sq += data * data; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + ScalarType data = *(in_ptr + x); + sum += data; + sum_sq += data * data; + } - ScalarType mean = sum / input->info()->dimension(0); - ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - ScalarType stddev_inv = 1.f / sqrt(var + epsilon); + ScalarType mean = sum / input->info()->dimension(0); + ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + ScalarType stddev_inv = 1.f / sqrt(var + epsilon); - auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); - // Store results - wrapper::vstore(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); + // Store results + wrapper::vstore(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); } template void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window); } // namespace cpu diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp index 53af1e4b16..32654df5dc 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -69,77 +70,76 @@ void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const float32x4_t quant_min_vec = vdupq_n_f32(0.0f); execute_window_loop( - win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = 
reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); - uint32x4_t sum_vec = vdupq_n_u32(0); - uint32x4_t sum_sq_vec = vdupq_n_u32(0); + uint32x4_t sum_vec = vdupq_n_u32(0); + uint32x4_t sum_sq_vec = vdupq_n_u32(0); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); - const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); - const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); - sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); + const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); + const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); + sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); + } #ifdef __aarch64__ - sum_vec = vpaddq_u32(sum_vec, sum_vec); - sum_vec = vpaddq_u32(sum_vec, sum_vec); - uint32_t sum = vgetq_lane_u32(sum_vec, 0); - sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); - sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + uint32_t sum = vgetq_lane_u32(sum_vec, 0); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); #elif __arm__ // #ifdef __aarch64__ - uint32_t sum = vgetq_lane_u32(sum_vec, 0) + - vgetq_lane_u32(sum_vec, 1) + - vgetq_lane_u32(sum_vec, 2) + - vgetq_lane_u32(sum_vec, 3); + uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) + + vgetq_lane_u32(sum_vec, 3); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + - vgetq_lane_u32(sum_sq_vec, 1) + - vgetq_lane_u32(sum_sq_vec, 2) + - vgetq_lane_u32(sum_sq_vec, 3); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) + + vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3); #endif // #ifdef __aarch64__ - for(; x < window_end_x; ++x) - { - auto data = static_cast(*(in_ptr + x)); - sum += data; - sum_sq += (data * data); - } + for (; x < window_end_x; ++x) + { + auto data = static_cast(*(in_ptr + x)); + sum += data; + sum_sq += (data * data); + } - const float mean = (static_cast(sum) / static_cast(input->info()->dimension(0))); - const float var = (static_cast(sum_sq) / static_cast(input->info()->dimension(0))) - (mean * mean); - const float stdev_inv = 1.0f / sqrtf(var + epsilon); - const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); - const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); - float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); - db1 = 
clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); - db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); - db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); - db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); - const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); - vst1q_u8(out_ptr + x, out); - } + const float mean = (static_cast(sum) / static_cast(input->info()->dimension(0))); + const float var = + (static_cast(sum_sq) / static_cast(input->info()->dimension(0))) - (mean * mean); + const float stdev_inv = 1.0f / sqrtf(var + epsilon); + const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); + const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); + float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); + db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); + db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); + db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); + db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); + const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); + vst1q_u8(out_ptr + x, out); + } - for(; x < window_end_x; ++x) - { - auto data = static_cast(*(in_ptr + x)); - const uint8_t res = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); - *(out_ptr + x) = res; - } - }, - input_itr, output_itr); + for (; x < window_end_x; ++x) + { + auto data = static_cast(*(in_ptr + x)); + const uint8_t res = + data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); + *(out_ptr + x) = res; + } + }, + input_itr, output_itr); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp index 4e15d3ad3f..4af59c2ad4 100644 --- a/src/cpu/kernels/pool2d/neon/fp16.cpp +++ b/src/cpu/kernels/pool2d/neon/fp16.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) @@ -37,7 +38,12 @@ namespace cpu { namespace { -void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_f16_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { const int window_start_x = 
window.x().start(); const int window_end_x = window.x().end(); @@ -53,8 +59,8 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int pad_right = src->info()->padding().right; @@ -63,97 +69,114 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset) + x_off; - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset) + x_off; - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset) + x_off; - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset) + x_off; - const auto v_x0 = vld1q_f16(in_x0_ptr); - const auto v_x1 = vld1q_f16(in_x1_ptr); - const auto v_x2 = vld1q_f16(in_x2_ptr); - const auto v_x3 = vld1q_f16(in_x3_ptr); - float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); - // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), 
vmovn_u32(voffset_x0_1)); - const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); - const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); - const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); - const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); - const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); - const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); - const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); - const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); - // Store indicies - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indeces3_0); - vst1q_u32(reinterpret_cast(indices.ptr() + 16) + x_off, tmp_indeces3_1); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? 
tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); -} -} + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset) + x_off; + const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset) + x_off; + const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset) + x_off; + const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset) + x_off; + const auto v_x0 = vld1q_f16(in_x0_ptr); + const auto v_x1 = vld1q_f16(in_x1_ptr); + const auto v_x2 = vld1q_f16(in_x2_ptr); + const auto v_x3 = vld1q_f16(in_x3_ptr); + float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); + // Store result + vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7}; + const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); + const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7}; + const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); + const uint32x4_t voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7}; + const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); + const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t 
voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7}; + const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); + const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); + const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); + const uint16x8_t tmp_indices2 = + vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); + const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); + const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); + // Store indicies + vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indeces3_0); + vst1q_u32(reinterpret_cast(indices.ptr() + 16) + x_off, tmp_indeces3_1); + } -void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); + float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); +} +} // namespace + +void poolingMxN_fp16_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) + if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) { pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } @@ -167,151 +190,172 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, Iterator in(src, window_src); Iterator out(dst0, window_out); - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); const float16_t min_value = get_initial_min(pool_info.use_inf_as_limit); float16x8_t vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x8_t scale_v = vdupq_n_f16(scale); - - // Perform pooling - vres = vdupq_n_f16(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); + + // 
Perform pooling + vres = vdupq_n_f16(0.0f); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f16(vres, vmulq_f16(data, data)); + const float16x8_t data = vld1q_f16( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } } - else + } + // Divide by scale + vres = vmulq_f16(vres, scale_v); + } + else + { + vres = vdupq_n_f16(min_value); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f16(vres, data); + const float16x8_t data = vld1q_f16( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f16(vres, data); } } } - // Divide by scale - vres = vmulq_f16(vres, scale_v); - } - else - { - vres = vdupq_n_f16(min_value); - for(int y = pool_start_y; y < pool_end_y; ++y) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f16(vres, data); - } + float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); + vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal)); } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); - vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); + // Store result + vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); } - // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float16_t res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - // Calculate scale - const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float16_t res = 0.0f; - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); + // Calculate scale + const float16_t scale = calculate_avg_scale_pool2d( + 
pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data * data; + const float data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } } - else + } + + // Divide by scale + res *= scale; + } + else + { + res = min_value; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data; + const float16_t data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); } } } - // Divide by scale - res *= scale; - } - else - { - res = min_value; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } + res = std::sqrt(res); } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; } - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); + }, + in, out); } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp index a400f3a95d..aaa37863cb 100644 --- a/src/cpu/kernels/pool2d/neon/fp32.cpp +++ b/src/cpu/kernels/pool2d/neon/fp32.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute @@ -34,7 +35,12 @@ namespace cpu { namespace { -void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_f32_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); @@ -50,8 +56,8 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const 
int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); float32x4_t vres; @@ -63,89 +69,102 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset); - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset); - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset); - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset); - const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); - const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); - const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); - const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); - vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); - const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); - const uint32x4_t tmp_indices2 = 
vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); - - // Store indices - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indices2); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset); + const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset); + const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset); + const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset); + const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); + const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); + const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); + const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); + vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); + // Store result + vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, 
DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32x4_t voffset_x0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x1 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x2 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x3 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); + const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); + const uint32x4_t tmp_indices2 = + vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); + + // Store indices + vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indices2); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); + res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? 
tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); } } // namespace -void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window) +void poolingMxN_fp32_neon_nhwc_kernel_indices( + const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window) { - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); constexpr int window_step_x = 4; Window window_out = window; @@ -160,8 +179,8 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const float min_value = get_initial_min(pool_info.use_inf_as_limit); @@ -169,9 +188,9 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, float32x4_t vres; uint32x4_t vidx; - constexpr int idx_width = 1; - constexpr int idx_height = 2; - constexpr int idx_batch = 3; + constexpr int idx_width = 1; + constexpr int idx_height = 2; + constexpr int idx_batch = 3; const int y_stride = static_cast(src->info()->strides_in_bytes().y()); const int z_stride = static_cast(src->info()->strides_in_bytes().z()); @@ -182,89 +201,97 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int pool_start_x = std::max(0, -idx_width); - const int pool_start_y = std::max(0, -idx_height); + const int pool_start_x = std::max(0, -idx_width); + const int pool_start_y = std::max(0, -idx_height); - const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width); - const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height); + const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width); + const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height); - const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride; - const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride); - const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride); + const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride); + const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - vres = vdupq_n_f32(min_value); - vidx = vdupq_n_u32(0U); - const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; - uint32_t 
curr_kernel_index = pool_size_x * pool_start_y; - for(int y = pool_start_y; y < pool_end_y; ++y) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); - curr_kernel_index += pool_start_x; - for(int x = pool_start_x; x < pool_end_x; ++x) + vres = vdupq_n_f32(min_value); + vidx = vdupq_n_u32(0U); + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + uint32_t curr_kernel_index = pool_size_x * pool_start_y; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast(in_ptr_x)); - const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index); - const uint32x4_t idxMask = vcgtq_f32(data, vres); - vidx = vbslq_u32(idxMask, vidx_curr, vidx); - vres = vmaxq_f32(vres, data); - in_ptr_x += y_stride; - curr_kernel_index++; + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + curr_kernel_index += pool_start_x; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32(reinterpret_cast(in_ptr_x)); + const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index); + const uint32x4_t idxMask = vcgtq_f32(data, vres); + vidx = vbslq_u32(idxMask, vidx_curr, vidx); + vres = vmaxq_f32(vres, data); + in_ptr_x += y_stride; + curr_kernel_index++; + } + curr_kernel_index += (pool_size_x - pool_end_x); + in_ptr_y += z_stride; } - curr_kernel_index += (pool_size_x - pool_end_x); - in_ptr_y += z_stride; + // Store result + vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); + vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, vidx); } - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, vidx); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = min_value; - uint32_t idx = 0U; - const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); - for(int x = pool_start_x; x < pool_end_x; ++x) + float res = min_value; + uint32_t idx = 0U; + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float data = *(reinterpret_cast(in_ptr_x)); - if(data > res) + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + for (int x = pool_start_x; x < pool_end_x; ++x) { - idx = pool_size_x * y + x; - res = data; + const float data = *(reinterpret_cast(in_ptr_x)); + if (data > res) + { + idx = pool_size_x * y + x; + res = data; + } + in_ptr_x += y_stride; } - in_ptr_x += y_stride; + in_ptr_y += z_stride; } - in_ptr_y += z_stride; - } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - *(reinterpret_cast(indices.ptr()) + x_off) = idx; - } - }, - out, indices); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + *(reinterpret_cast(indices.ptr()) + x_off) = idx; + } + }, + out, indices); } -void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp32_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr)) + if ((pool_info.pool_type == 
PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr)) { poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window); } - else if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr)) + else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && + !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr)) { pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } @@ -280,153 +307,174 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, Iterator in(src, window_src); Iterator out(dst0, window_out); - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = + pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = + pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); const float min_value = get_initial_min(pool_info.use_inf_as_limit); float32x4_t vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x4_t scale_v = vdupq_n_f32(scale); + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); - // Perform pooling - vres = vdupq_n_f32(0.0f); + // Perform pooling + vres = vdupq_n_f32(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vmlaq_f32(vres, data, data); - } - else + for (int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f32(vres, data); + const float32x4_t data = vld1q_f32( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } } } + // Divide by scale + vres = vmulq_f32(vres, scale_v); } - // Divide by scale - vres = vmulq_f32(vres, scale_v); - } - else - { - vres = vdupq_n_f32(min_value); - for(int y = pool_start_y; y < pool_end_y; 
++y) + else { - for(int x = pool_start_x; x < pool_end_x; ++x) + vres = vdupq_n_f32(min_value); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f32(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f32(vres, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float32x4_t l2_res = { static_cast(sqrt(vgetq_lane_f32(vres, 0))), - static_cast(sqrt(vgetq_lane_f32(vres, 1))), - static_cast(sqrt(vgetq_lane_f32(vres, 2))), - static_cast(sqrt(vgetq_lane_f32(vres, 3))) - }; - vres = l2_res; - } - - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + float32x4_t l2_res = {static_cast(sqrt(vgetq_lane_f32(vres, 0))), + static_cast(sqrt(vgetq_lane_f32(vres, 1))), + static_cast(sqrt(vgetq_lane_f32(vres, 2))), + static_cast(sqrt(vgetq_lane_f32(vres, 3)))}; + vres = l2_res; + } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = 0.0f; + // Store result + vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); + } - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float res = 0.0f; - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data * data; + const float data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } } - else + } + + // Divide by scale + res *= scale; + } + else + { + res = min_value; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data; + const float data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - 
pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); } } } - // Divide by scale - res *= scale; - } - else - { - res = min_value; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } + res = std::sqrt(res); } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; } - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); + }, + in, out); } } } // namespace cpu diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h index eb141d6fcd..f8f458a63e 100644 --- a/src/cpu/kernels/pool2d/neon/list.h +++ b/src/cpu/kernels/pool2d/neon/list.h @@ -26,16 +26,19 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/pool2d/neon/quantized.h" + #include namespace arm_compute { namespace cpu { -#define DECLARE_POOLING_KERNEL(func_name) \ - void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window) +#define DECLARE_POOLING_KERNEL(func_name) \ + void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \ + const Window &window) DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc); DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc); @@ -65,7 +68,12 @@ T get_initial_min(bool use_inf_as_limit) } template -inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout) +inline uint32_t offset_no_padding(uint32_t padded_offset, + const Coordinates &id, + const ITensorInfo &info, + int pool_stride_x, + int pool_stride_y, + DataLayout data_layout) { const int pad_left = info.padding().left; const int pad_right = info.padding().right; @@ -76,22 +84,24 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const int pad_horiz = pad_left + pad_right; const int pad_vert = pad_top + pad_bottom; - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ - - pad_top * sizeof(T) /* top padding */ - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ - - in_stride_w * id[3]; + const uint32_t offset_base = + padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ + - pad_top * sizeof(T) /* top padding */ + - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - + pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ + - in_stride_w * id[3]; return offset_base; } else { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per 
row - - pad_top * sizeof(T) // top padding - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems + const uint32_t offset_base = padded_offset - + sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row + - pad_top * sizeof(T) // top padding + - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * + pool_stride_y // for each Z plane there are width*pad_right padding elems - in_stride_w * id[3]; return offset_base; @@ -100,4 +110,4 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp index c342b96426..ee4a67b0fb 100644 --- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp +++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp @@ -25,9 +25,11 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" + #include #ifdef ENABLE_NCHW_KERNELS @@ -38,15 +40,19 @@ namespace cpu #define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ (x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr) #define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ - (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) -#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ - ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) + (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) \ + : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) +#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ + ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \ + ? 
vdup_n_f32(fval) \ + : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) #define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \ READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval)) -float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval) +float32x4x2_t +read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval) { float32x4x2_t vec; vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval); @@ -56,13 +62,14 @@ float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval) +float16x4_t +read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval) { float16_t vec[4]; const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) { vec[i] = *(ptr + i); } @@ -74,94 +81,106 @@ float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, return wrapper::vload(vec); } -void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling3_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.f; - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, reinterpret_cast(src_middle_ptr + in.offset()), fill_value); - float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_2, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.f; + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const unsigned char *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) { - top_data = vmul_f16(top_data, top_data); - middle_data = vmul_f16(middle_data, middle_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + float16x4_t top_data = + read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0, + reinterpret_cast(src_top_ptr + in.offset()), fill_value); + float16x4_t middle_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1, + reinterpret_cast(src_middle_ptr + in.offset()), fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2, + reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); + float16x4_t res = {}; - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); - // Perform pooling - const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); - res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); - res = vmul_f16(vpadd_f16(res, res), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); - res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data); - res = vpmax_f16(res, res); - } + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + middle_data = vmul_f16(middle_data, middle_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vsqrt_f16(res); - } + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + // Perform pooling + const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); + res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); + res = vmul_f16(vpadd_f16(res, res), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); + res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data); + res = vpmax_f16(res, res); + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); + *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); + }, 
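// A rough, standalone sketch (illustrative names, not the library's implementation)
// of what the calculate_avg_scale_pool2d(...) calls in these kernels compute for
// average/L2 pooling: the divisor is the pooling window clamped to the given upper
// bounds and, when exclude_padding is set, further clamped so that left/top padding
// elements do not count towards the average.
#include <algorithm>

static inline float avg_pool_scale_sketch(bool exclude_padding,
                                          int out_x, int out_y,
                                          int pool_size_x, int pool_size_y,
                                          int upper_bound_w, int upper_bound_h,
                                          int pad_left, int pad_top,
                                          int stride_x, int stride_y)
{
    // Top-left corner of the pooling window, expressed in padded input coordinates.
    int start_x = out_x * stride_x - pad_left;
    int start_y = out_y * stride_y - pad_top;
    // Clamp the far edge; upper_bound_w/h already include right/bottom padding
    // when exclude_padding is false, so padded taps are counted in that case.
    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
    if (exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / static_cast<float>((end_y - start_y) * (end_x - start_x));
}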
+ in, out); } template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float16x4_t in) +inline typename std::enable_if::value, float32x2_t>::type f16_to_f32(float16x4_t in) { - float32x2_t out = { static_cast(vget_lane_f16(in, 0)), static_cast(vget_lane_f16(in, 1)) }; + float32x2_t out = {static_cast(vget_lane_f16(in, 0)), static_cast(vget_lane_f16(in, 1))}; return out; } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float32x2_t in) +inline typename std::enable_if::value, float32x2_t>::type f16_to_f32(float32x2_t in) { return in; } @@ -171,9 +190,9 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int { T vec[2]; const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 2; i++) + for (int i = 0; i < 2; i++) { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) { vec[i] = *(ptr + i); } @@ -186,61 +205,80 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int } template -void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_nchw_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { Iterator in(src, window_src); Iterator out(dst0, window); Iterator indices(dst1, window); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const int pad_left = src->info()->padding().left; - const int pad_right = src->info()->padding().right; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - const T float_min = get_initial_min(pool_info.use_inf_as_limit); - const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - float32x2_t top_data_f32 = f16_to_f32(top_data); - float32x2_t bottom_data_f32 = f16_to_f32(bottom_data); - - // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. 
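// A minimal standalone sketch of the boundary-aware read pattern used by the
// read_*_boundary_aware helpers and READ_*_BOUNDARY_AWARE macros in this file:
// x and y are coordinates in the padded input, and a tap that falls outside the
// valid region is replaced by fill_value (the type's lowest value for max pooling,
// 0 for average/L2 pooling), so padded taps never win a max and never change a sum.
// Names are illustrative; ptr is assumed to already point at the tap to read.
template <typename T>
static inline T read_tap_or_fill(const T *ptr, int src_w, int src_h,
                                 int pad_l, int pad_t, int x, int y, T fill_value)
{
    const bool in_bounds = (y >= pad_t) && (y < src_h + pad_t) &&
                           (x >= pad_l) && (x < src_w + pad_l);
    return in_bounds ? *ptr : fill_value;
}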
- const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); - const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); - const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); - *(reinterpret_cast(out.ptr())) = static_cast(vget_lane_f32(max_data, 0)); - - // Calculate max data indice, which will be used in max unpool. - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); - const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); - const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; - const uint32x2_t voffset_top = { offset_top, offset_top + 1u }; - const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u }; - const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); - const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom)); - *(reinterpret_cast(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); - }, - in, out, indices); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + const int pad_left = src->info()->padding().left; + const int pad_right = src->info()->padding().right; + const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); + const T float_min = get_initial_min(pool_info.use_inf_as_limit); + const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f; + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0, + reinterpret_cast(src_top_ptr + in.offset()), fill_value); + auto bottom_data = + read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1, + reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); + float32x2_t top_data_f32 = f16_to_f32(top_data); + float32x2_t bottom_data_f32 = f16_to_f32(bottom_data); + + // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. + const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); + const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); + const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); + *(reinterpret_cast(out.ptr())) = static_cast(vget_lane_f32(max_data, 0)); + + // Calculate max data indice, which will be used in max unpool. 
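// A compact standalone sketch (illustrative only) of the max + index selection that
// pooling2_nchw_maxpool_indices performs around this point: vpmax_f32 reduces each
// 2-element row to its maximum, vmax_f32 picks the winning row, and vcge/vbsl keep
// the linear offset of the lane that held the maximum, preferring the first element
// on ties. The offsets here are plain element indices for illustration; the kernel
// derives them via offset_no_padding.
#include <arm_neon.h>
#include <cstdint>

static inline void max2x2_with_index_sketch(const float top[2], const float bottom[2],
                                            uint32_t first_offset_top, uint32_t first_offset_bottom,
                                            float &max_out, uint32_t &idx_out)
{
    const float32x2_t t = vld1_f32(top);
    const float32x2_t b = vld1_f32(bottom);

    // Row-wise maxima: both lanes of each result hold that row's maximum.
    const float32x2_t max_t = vpmax_f32(t, t);
    const float32x2_t max_b = vpmax_f32(b, b);
    max_out = vget_lane_f32(vmax_f32(max_t, max_b), 0);

    // Candidate offsets per row; compare each lane with its neighbour (>= so the
    // first/leftmost element wins ties) and select the offset of the larger lane.
    const uint32x2_t off_t = {first_offset_top, first_offset_top + 1u};
    const uint32x2_t off_b = {first_offset_bottom, first_offset_bottom + 1u};
    const uint32x2_t idx_t = vbsl_u32(vcge_f32(t, vrev64_f32(t)), off_t, vrev64_u32(off_t));
    const uint32x2_t idx_b = vbsl_u32(vcge_f32(b, vrev64_f32(b)), off_b, vrev64_u32(off_b));

    // Keep the index of whichever row produced the overall maximum.
    idx_out = vget_lane_u32(vbsl_u32(vcge_f32(max_t, max_b), idx_t, idx_b), 0);
}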
+ const uint32_t offset_base = + offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); + const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); + const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; + const uint32x2_t voffset_top = {offset_top, offset_top + 1u}; + const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u}; + const uint32x2_t tmp_indices_top = + vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); + const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), + voffset_bottom, vrev64_u32(voffset_bottom)); + *(reinterpret_cast(indices.ptr())) = vget_lane_u32( + vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); + }, + in, out, indices); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_type == PoolingType::MAX && dst1) + if (pool_info.pool_type == PoolingType::MAX && dst1) { pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } @@ -254,244 +292,274 @@ void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P const int pool_pad_left = pool_info.pad_stride_info.pad_left(); const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); int pool_stride_x, pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; - - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, in_top_ptr, fill_value); - float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, in_bottom_ptr, fill_value); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 
0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; + + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) { - top_data = vmul_f16(top_data, top_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } + const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_0, in_top_ptr, fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_1, in_bottom_ptr, fill_value); + float16x4_t res = {}; - if(pool_info.pool_type != PoolingType::MAX) - { - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } - const float16x4_t sum_data = vadd_f16(top_data, bottom_data); - res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(top_data, bottom_data); - res = vpmax_f16(max_data, max_data); - } + if (pool_info.pool_type != PoolingType::MAX) + { + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vsqrt_f16(res); - } + const float16x4_t sum_data = vadd_f16(top_data, bottom_data); + res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(top_data, bottom_data); + res = vpmax_f16(max_data, max_data); + } - // Store result - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } + + // Store result + *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); } } -void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - const int pool_size_x = pool_info.is_global_pooling ? 
src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; - - execute_window_loop(window, [&](const Coordinates & id) - { - float16_t res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) { - // Calculate scale - const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float16_t res = 0.0f; - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = 0; x < pool_size_x; ++x) + // Calculate scale + const float16_t scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *ptr; + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } + if (pool_info.pool_type == PoolingType::L2) + { + data *= data; + } - res += data; + res += data; + } } - } - // Divide by scale - res *= scale; - } - else // if max pooling - { - res = fp16_min; - - for(int y = 0; y < pool_size_y; ++y) + // Divide by scale + res *= scale; + } + else // if max pooling { - for(int x = 0; x < pool_size_x; ++x) - { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + res = fp16_min; - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - res = std::max(res, data); + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + res = std::max(res, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); + // Store result + *(reinterpret_cast(out.ptr())) = res; + }, + in, out); } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; - - execute_window_loop(window, [&](const Coordinates & id) - { - float res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float res = 0.0f; - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = 0; x < pool_size_x; ++x) + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *ptr; - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } + if (pool_info.pool_type == PoolingType::L2) + { + data *= data; + } - res += data; + res += data; + } } - } - // Divide by scale - res *= scale; - } - else // if max pooling - { - res = min_value; - - for(int y = 0; y < pool_size_y; ++y) + // Divide by scale + res *= scale; + } + else // if max pooling { - for(int x = 0; x < pool_size_x; ++x) - { - const auto ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + res = min_value; - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - res = std::max(res, data); + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + res = std::max(res, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); + // Store result + *(reinterpret_cast(out.ptr())) = res; + }, + in, out); } -void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_type == PoolingType::MAX && dst1) + if (pool_info.pool_type == PoolingType::MAX && dst1) { pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } @@ -499,64 +567,168 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P { Iterator in(src, window_src); Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, + in_top_ptr, fill_value); + auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_bottom_ptr, fill_value); + float32x2_t res = {}; + float final_res = 0; - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f32(top_data, top_data); + bottom_data = vmul_f32(bottom_data, bottom_data); + } + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x2_t sum_data = vadd_f32(top_data, bottom_data); + res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + } + else + { + const float32x2_t max_data = vmax_f32(top_data, bottom_data); + res = vpmax_f32(max_data, max_data); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } - execute_window_loop(window, [&](const Coordinates & id) + // Store result + *(reinterpret_cast(out.ptr())) = final_res; + }, + in, out); + } +} + +void pooling3_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + 
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const uint8_t *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) { const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); + const auto in_middle_ptr = reinterpret_cast(src_middle_ptr + in.offset()); const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value); - auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value); - float32x2_t res = {}; - float final_res = 0; + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, + fill_value); + auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_middle_ptr, fill_value); + auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, + in_bottom_ptr, fill_value); + + float32x2_t res = {}; + float final_res = 0; // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { - top_data = vmul_f32(top_data, top_data); - bottom_data = vmul_f32(bottom_data, bottom_data); + top_data = vmulq_f32(top_data, top_data); + middle_data = vmulq_f32(middle_data, middle_data); + bottom_data = vmulq_f32(bottom_data, bottom_data); } - if(pool_info.pool_type != PoolingType::MAX) + if (pool_info.pool_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling - const float32x2_t sum_data = vadd_f32(top_data, bottom_data); - res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); + res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), 
vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); } else { - const float32x2_t max_data = vmax_f32(top_data, bottom_data); - res = vpmax_f32(max_data, max_data); + const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data)); + res = vpmax_f32(res, res); } final_res = vget_lane_f32(res, 0); // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { final_res = sqrt(final_res); } @@ -565,191 +737,120 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P *(reinterpret_cast(out.ptr())) = final_res; }, in, out); - } -} - -void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); - const auto in_middle_ptr = reinterpret_cast(src_middle_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value); - auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value); - auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value); - - float32x2_t res = {}; - float final_res = 0; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmulq_f32(top_data, top_data); - middle_data = vmulq_f32(middle_data, middle_data); - bottom_data = vmulq_f32(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); - res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data)); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); } -void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling7_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - constexpr const int pool_size = 7; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr const int pool_size = 7; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top 
= pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; - - std::array src_ptrs{ {} }; - for(int i = 0; i < pool_size; ++i) + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + std::array src_ptrs{{}}; + for (int i = 0; i < pool_size; ++i) { - src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); + src_ptrs[i] = + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); } - execute_window_loop(window, [&](const Coordinates & id) - { - auto in_ptr = reinterpret_cast(src_ptrs[0] + in.offset()); - - auto x_val = id.x() * pool_stride_x; - auto y_val = id.y() * pool_stride_y; - float32x4x2_t data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto in_ptr = reinterpret_cast(src_ptrs[0] + in.offset()); - float32x2_t res = {}; - float final_res = 0.f; + auto x_val = id.x() * pool_stride_x; + auto y_val = id.y() * pool_stride_y; + float32x4x2_t data = + read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); + float32x2_t res = {}; + float final_res = 0.f; - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type != PoolingType::MAX) { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); - for(int i = 1; i < pool_size; ++i) - { - in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); - x_val = id.x() * pool_stride_x; - y_val = (id.y() * pool_stride_y) + i; - data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); // Get power of 2 in case of l2 pooling - if(pool_info.pool_type 
== PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { data.val[0] = vmulq_f32(data.val[0], data.val[0]); data.val[1] = vmulq_f32(data.val[1], data.val[1]); } - sum_data = vaddq_f32(sum_data, data.val[0]); - sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); + for (int i = 1; i < pool_size; ++i) + { + in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); + + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, + fill_value); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + sum_data = vaddq_f32(sum_data, data.val[0]); + sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + } + res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); } - res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - for(int i = 1; i < pool_size; ++i) + else { - in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); + for (int i = 1; i < pool_size; ++i) + { + in_ptr = reinterpret_cast(src_ptrs[i] + in.offset()); - x_val = id.x() * pool_stride_x; - y_val = (id.y() * pool_stride_y) + i; - float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); - data = vmax2q_f32(data, temp); + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, + in_ptr, fill_value); + data = vmax2q_f32(data, temp); + } + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1])); + res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0]))); + res = vpmax_f32(res, res); } - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1])); - res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0]))); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); + final_res = vget_lane_f32(res, 0); - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); + // Store result + *(reinterpret_cast(out.ptr())) = final_res; + }, + in, out); } } // namespace cpu } // namespace arm_compute -#endif // ENABLE_NCHW_KERNELS \ No newline at end of file +#endif // ENABLE_NCHW_KERNELS diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp index 7f8841edd8..44675b5394 100644 --- a/src/cpu/kernels/pool2d/neon/qasymm8.cpp +++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include 
"src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute { namespace cpu { -void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { poolingMxN_q8_neon_nhwc(src, dst0, dst1, pool_info, window_src, window); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp index 8643651f27..d434323e89 100644 --- a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute { namespace cpu { -void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { poolingMxN_q8_neon_nhwc(src, dst0, dst1, pool_info, window_src, window); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h index a2cd3991be..38f1b2f1f9 100644 --- a/src/cpu/kernels/pool2d/neon/quantized.h +++ b/src/cpu/kernels/pool2d/neon/quantized.h @@ -26,11 +26,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/PoolingHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/PoolingHelpers.h" + #include namespace arm_compute @@ -38,7 +40,12 @@ namespace arm_compute namespace cpu { template -void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_q8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); @@ -60,15 +67,15 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P using q32_t = typename wrapper::traits::promote_t; using q32x4_t = typename wrapper::traits::neon_vector::type; - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().z() : pool_info.pool_size.height; const int pool_pad_right = pool_info.pad_stride_info.pad_right(); const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); @@ -80,233 +87,267 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); + const int32_t new_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); 
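// Note on this step: the four q32x4_t accumulators (vres1..vres4) hold the running sums for the
// sixteen 8-bit lanes of each loaded vector, widened to 32 bit so that large pooling windows
// cannot overflow before the average is rescaled. A minimal sketch of the widening chain, reusing
// only the wrapper:: helpers already called in this loop (variable names mirror the surrounding
// code and are illustrative, not an addition to the patch itself):
//
//   const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));                    // 8-bit -> 16-bit
//   vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));           // 16-bit -> 32-bit sums
//
// The 32-bit sums are later multiplied by the averaging scale and narrowed back to 8 bit
// (or requantized when the source and destination quantization infos differ).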
- q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, + wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest 
instead of rounding towards zero + vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = + wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = + wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + } } else { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); - } - } - else - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } } - } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? 
vrequantize_pooling(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } } - } - if(pool_info.pool_type == PoolingType::MAX) - { - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) + if (pool_info.pool_type == PoolingType::MAX) { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x8_t data = wrapper::vload( + reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } } - } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); + } } - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - q32_t res = static_cast(0.f); + if (pool_info.pool_type != PoolingType::MAX) + { + q32_t res = static_cast(0.f); - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res += data; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res += data; + } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - 
*(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast(0.5f + static_cast(res) * scale); + + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } else { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - } - else - { - T res = std::numeric_limits::min(); + T res = std::numeric_limits::min(); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast( + in.ptr() + + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); + } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } } - } - - }, - in, out); + }, + in, out); } #if defined(ENABLE_NCHW_KERNELS) template -inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step, - const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +inline void scale_vector_q16x8(bool exclude_padding, + TVec &v, + const Coordinates &id, + int id_offset, + int step, + const int pool_size, + const int upper_bound_w, + const int upper_bound_h, + const int pad_x, + const int pad_y, + const int stride_x, + const int stride_y) { int start_x = (id.x() + id_offset) * stride_x - pad_x; int start_y = id.y() * stride_y - pad_y; const int end_y = std::min(start_y + pool_size, upper_bound_h); - if(exclude_padding) + if (exclude_padding) { start_y = std::max(0, start_y); } - std::array elems = - { - { - wrapper::vgetlane(v, 0), - wrapper::vgetlane(v, 1), - wrapper::vgetlane(v, 2), - wrapper::vgetlane(v, 3), - wrapper::vgetlane(v, 4), - wrapper::vgetlane(v, 5), - wrapper::vgetlane(v, 6), - wrapper::vgetlane(v, 7), - } - }; - - for(auto &el : elems) + std::array elems = {{ + wrapper::vgetlane(v, 0), + wrapper::vgetlane(v, 1), + wrapper::vgetlane(v, 2), + wrapper::vgetlane(v, 3), + wrapper::vgetlane(v, 4), + wrapper::vgetlane(v, 5), + wrapper::vgetlane(v, 6), + wrapper::vgetlane(v, 7), + }}; + + for (auto &el : elems) { int c_start_x = start_x; const int end_x = std::min(c_start_x + pool_size, upper_bound_w); - if(exclude_padding) + if (exclude_padding) { c_start_x = std::max(0, c_start_x); } @@ -326,15 +367,16 @@ inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates } template -auto 
load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval) +auto load16_boundary_aware( + int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval) { ARM_COMPUTE_UNUSED(pad_b, pad_r); T vec[16]; //handle reading a row out of the tensor const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 16; i++) + for (int i = 0; i < 16; i++) { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) { vec[i] = *(ptr + i); } @@ -349,24 +391,24 @@ auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, template inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr) { - if(deinterleave) + if (deinterleave) { - for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i) { *(ptr + i * 2) = lower[i]; } - for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i) + for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i) { *(ptr + 1 + i * 2) = upper[i]; } } else { - for(int i = 0; i < 8 && (i + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) { *(ptr + i) = lower[i]; } - for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i) { *(ptr + i + 8) = upper[i]; } @@ -376,14 +418,19 @@ inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &up template inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr) { - for(int i = 0; i < 8 && (i + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) { *(ptr + i) = v[i]; } } template -void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -397,129 +444,136 @@ void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds using q16x8_t = typename wrapper::traits::neon_vector::type; using q16x8x2_t = typename wrapper::traits::neon_vector::type; - constexpr int pool_size = 2; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + constexpr int pool_size = 2; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const T *const src_top_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); + const T *const src_bottom_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); const int scale_step_x = (pool_stride_x == 1) ? 2 : 1; const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); const bool have_different_qinfo = src_qinfo != dst_qinfo; - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int dst_w = dst0->info()->dimension(0); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int dst_w = dst0->info()->dimension(0); const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
std::numeric_limits::min() : T(0); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - - auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_1, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; - q8x8_t lower_res = {}; - q8x8_t upper_res = {}; + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - if(pool_info.pool_type != PoolingType::MAX) - { - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; + q8x8_t lower_res = {}; + q8x8_t upper_res = {}; - // Add rows - const q16x8x2_t vrsum = + if (pool_info.pool_type != PoolingType::MAX) { - { + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Add rows + const q16x8x2_t vrsum = {{ wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), - } - }; + }}; - // Pair-wise add row data - const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); - const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); + // Pair-wise add row data + const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); + const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); - q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); + q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - lower_res = wrapper::vmovn(res_lower); + // Scale lower result + scale_vector_q16x8(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + lower_res = wrapper::vmovn(res_lower); - // Compute upper result for stride_x == 1 - if(pool_stride_x == 1) - { - // Shifted row sum - const q16x8x2_t vrsum_shifted = + // Compute upper result for stride_x == 1 + if (pool_stride_x == 1) { - { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - 
wrapper::vext_1(vrsum.val[1], vrsum.val[1]) - } - }; - - // Pair-wise add shifted row - q16x8_t res_upper = wrapper::vcombine( - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1]))); - - // Scale upper result - scale_vector_q16x8(pool_info.exclude_padding, res_upper, id, 1, 2, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - upper_res = wrapper::vmovn(res_upper); + // Shifted row sum + const q16x8x2_t vrsum_shifted = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + + // Pair-wise add shifted row + q16x8_t res_upper = wrapper::vcombine( + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), + wrapper::vgethigh(vrsum_shifted.val[1]))); + + // Scale upper result + scale_vector_q16x8(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + upper_res = wrapper::vmovn(res_upper); + } } - } - else - { - const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); - lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); - if(pool_stride_x == 1) + else { - const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); - upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); + lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); + if (pool_stride_x == 1) + { + const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); + upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + } } - } - if(have_different_qinfo) - { - const auto requantized_dst = vrequantize_pooling(lower_res, upper_res, requant_qinfo); - lower_res = wrapper::vgetlow(requantized_dst); - upper_res = wrapper::vgethigh(requantized_dst); - } - auto out_ptr = reinterpret_cast(out.ptr()); - // Store result - if(pool_stride_x == 1) - { - write16_boundary_aware(id.x(), dst_w, lower_res, upper_res, out_ptr); - } - else - { - write8_boundary_aware(id.x(), dst_w, lower_res, out_ptr); - } - }, - in, out); + if (have_different_qinfo) + { + const auto requantized_dst = vrequantize_pooling(lower_res, upper_res, requant_qinfo); + lower_res = wrapper::vgetlow(requantized_dst); + upper_res = wrapper::vgethigh(requantized_dst); + } + auto out_ptr = reinterpret_cast(out.ptr()); + // Store result + if (pool_stride_x == 1) + { + write16_boundary_aware(id.x(), dst_w, lower_res, upper_res, out_ptr); + } + else + { + write8_boundary_aware(id.x(), dst_w, lower_res, out_ptr); + } + }, + in, out); } template -void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling3_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -533,13 +587,13 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds using q16x8_t = typename wrapper::traits::neon_vector::type; using q16x8x2_t = typename 
wrapper::traits::neon_vector::type; - constexpr int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); @@ -547,147 +601,145 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_middle_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2))); + const T *const src_top_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); + const T *const src_middle_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); + const T *const src_bottom_ptr = reinterpret_cast( + src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2))); const int src_w = src->info()->dimension(0); const int src_h = src->info()->dimension(1); const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? 
T(0) : std::numeric_limits::min(); const int dst_w = dst0->info()->dimension(0); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - - auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); - auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_1, reinterpret_cast(src_middle_ptr + in.offset()), fill_value); - auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_2, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); - - q8x8_t fres = {}; - q8x16_t fqres = {}; - - if(pool_info.pool_type == PoolingType::AVG) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Convert data to u16 - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; - - // Calculate row sums - const q16x8x2_t vrsum = + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast(src_top_ptr + in.offset()), fill_value); + auto middle_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast(src_middle_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_2, reinterpret_cast(src_bottom_ptr + in.offset()), fill_value); + + q8x8_t fres = {}; + q8x16_t fqres = {}; + + if (pool_info.pool_type == PoolingType::AVG) { + // Convert data to u16 + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t middle_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Calculate row sums + const q16x8x2_t vrsum = {{ + wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), + wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + }}; + const q16x8x2_t vrsum_shifted_1 = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + const q16x8x2_t vrsum_shifted_2 = { + {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}}; + // Calculate final sum + q16x8x2_t final_sum = {{ + wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), 
vrsum_shifted_2.val[0]), + wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + }}; + if (pool_stride_x == 2) { - wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), - wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + q16x8_t res = { + wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2), + wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6), + wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2), + wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6), + }; + + scale_vector_q16x8(pool_info.exclude_padding, res, id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fres = wrapper::vmovn(res); } - }; - const q16x8x2_t vrsum_shifted_1 = - { + else { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) + // Scale lower result + scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + // Scale lower result + scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); } - }; - const q16x8x2_t vrsum_shifted_2 = + } + else { + const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); + const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); + const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); + const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); + + if (pool_stride_x == 2) { - wrapper::vext_2(vrsum.val[0], vrsum.val[1]), - wrapper::vext_2(vrsum.val[1], vrsum.val[1]) + const q8x8x2_t table = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}}; + static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14}; + fres = wrapper::vtbl(table, lookup_val); } - }; - // Calculate final sum - q16x8x2_t final_sum = - { + else { - wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), - wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + fqres = final_max; } - }; - if(pool_stride_x == 2) - { - q16x8_t res = - { - wrapper::vgetlane(final_sum.val[0], 0), - wrapper::vgetlane(final_sum.val[0], 2), - wrapper::vgetlane(final_sum.val[0], 4), - wrapper::vgetlane(final_sum.val[0], 6), - wrapper::vgetlane(final_sum.val[1], 0), - wrapper::vgetlane(final_sum.val[1], 2), - wrapper::vgetlane(final_sum.val[1], 4), - wrapper::vgetlane(final_sum.val[1], 6), - }; - - scale_vector_q16x8(pool_info.exclude_padding, res, id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - fres = wrapper::vmovn(res); } - else - { - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, 
pool_stride_y); - fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); - } - } - else - { - const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); - const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); - const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); - const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); - if(pool_stride_x == 2) + // Store result + if (pool_stride_x == 1) { - const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } }; - static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - fres = wrapper::vtbl(table, lookup_val); + if (src_qinfo != dst_qinfo) + { + fqres = vrequantize_pooling(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), + requant_qinfo); + } + write16_boundary_aware(id.x(), dst_w, wrapper::vgetlow(fqres), + wrapper::vgethigh(fqres), reinterpret_cast(out.ptr())); } else { - fqres = final_max; - } - } - - // Store result - if(pool_stride_x == 1) - { - if(src_qinfo != dst_qinfo) - { - fqres = vrequantize_pooling(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo); - } - write16_boundary_aware(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast(out.ptr())); - } - else - { - if(src_qinfo != dst_qinfo) - { - fres = vrequantize_pooling(fres, requant_qinfo); + if (src_qinfo != dst_qinfo) + { + fres = vrequantize_pooling(fres, requant_qinfo); + } + write8_boundary_aware(id.x(), dst_w, fres, reinterpret_cast(out.ptr())); } - write8_boundary_aware(id.x(), dst_w, fres, reinterpret_cast(out.ptr())); - } - }, - in, out); + }, + in, out); } template -void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -697,74 +749,81 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor * using q16_t = typename wrapper::traits::promote_t; using q32_t = typename wrapper::traits::promote_t; - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits::min(); - const int stridex_in_bytes = static_cast(src->info()->strides_in_bytes().x()); - const int stridey_in_bytes = static_cast(src->info()->strides_in_bytes().y()); - - execute_window_loop(window, [&](const Coordinates & id) - { - T res = std::numeric_limits::min(); - - if(pool_info.pool_type != PoolingType::MAX) + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits::min(); + const int stridex_in_bytes = static_cast(src->info()->strides_in_bytes().x()); + const int stridey_in_bytes = static_cast(src->info()->strides_in_bytes().y()); + + execute_window_loop( + window, + [&](const Coordinates &id) { - q32_t sres = 0; - - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + T res = std::numeric_limits::min(); - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = 0; x < pool_size_x; ++x) + q32_t sres = 0; + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) { - const auto in_ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; - sres += data; + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *in_ptr; + sres += data; + } } + // Divide by scale + res = static_cast(support::cpp11::round(sres * scale)); } - // Divide by scale - res = static_cast(support::cpp11::round(sres * scale)); - } - else - { - for(int y = 0; y < pool_size_y; ++y) + else { - for(int x = 0; x < pool_size_x; ++x) + for (int y = 0; y < pool_size_y; ++y) { - const auto in_ptr = reinterpret_cast(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; - res = std::max(res, data); + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; + res = std::max(res, data); + } } } - } - // Store result - res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper::quantize(Qasymm8QuantizationHelper::dequantize(res, src_qinfo), dst_qinfo) : res; - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); + // Store result + res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper::quantize( + Qasymm8QuantizationHelper::dequantize(res, src_qinfo), dst_qinfo) + : res; + *(reinterpret_cast(out.ptr())) = res; + }, + in, out); } #endif /* defined(ENABLE_NCHW_KERNELS) */ } // namespace cpu diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h index 013e25537c..ce89199b5d 100644 --- a/src/cpu/kernels/pool3d/neon/impl.h +++ b/src/cpu/kernels/pool3d/neon/impl.h @@ -25,9 +25,10 @@ #define SRC_CORE_POOLING_3D_LAYER_IMPL_H #include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool3d/neon/quantized.h" namespace arm_compute @@ -37,8 +38,13 @@ namespace cpu namespace { template -void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_start_x, const int window_end_x, const int window_step_x) +void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; @@ -71,80 +77,87 @@ void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * 
pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmax(vres, data); + } } } + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); - res = -std::numeric_limits::infinity(); - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = 
in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + res = -std::numeric_limits::infinity(); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res = std::max(res, data); + } } } + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + }, + out); } template -void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; using vector_type = typename vtype::type; @@ -183,95 +196,103 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * 
pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vadd(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vadd(vres, data); + } } } - } - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < 
pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } } } - } - // Divide by scale - res *= scale; + // Divide by scale + res *= scale; - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); } template -void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; using vector_type = typename vtype::type; @@ -310,97 +331,100 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + 
const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmla(vres, data, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmla(vres, data, data); + } } } - } - - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - // Calculate square-root - vres = wrapper::vinv(wrapper::vinvsqrt(vres)); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } + // Calculate square-root + vres = wrapper::vinv(wrapper::vinvsqrt(vres)); - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data * data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; 
+ for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data * data; + } } } - } - // Divide by scale - res *= scale; + // Divide by scale + res *= scale; - // Square root - res = std::sqrt(res); + // Square root + res = std::sqrt(res); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); } } // namespace @@ -415,16 +439,19 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: - max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::AVG: - avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::L2: - l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; default: ARM_COMPUTE_ERROR("Pool operation not supported"); @@ -440,7 +467,7 @@ void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: max_poolingMxNxD_q8_neon_ndhwc(src, dst0, pool_info, window_out, window_step_x); diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index ac14f5eafa..8819907901 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -26,17 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template -void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void avg_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -89,144 +90,147 @@ void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * 
pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const int32_t new_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t 
*in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale(vres, 
quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + } } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - q32_t res = static_cast(0.f); - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32_t res = static_cast(0.f); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast(0.5f + static_cast(res) * scale); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } template -void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void max_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo 
&pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), 
wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - // Leftovers using half the window step - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? vrequantize_pooling(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Leftovers using half the window step + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res = std::numeric_limits::min(); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? 
vrequantize_pooling(vres, requant_qinfo) : vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res = std::numeric_limits::min(); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + + res = std::max(res, data); + } } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp index 5d50dce907..505c18c27d 100644 --- a/src/cpu/kernels/range/generic/neon/fp16.cpp +++ b/src/cpu/kernels/range/generic/neon/fp16.cpp @@ -23,10 +23,10 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/range/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" namespace arm_compute { diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp index 6044f0f886..e5e472abb5 100644 --- a/src/cpu/kernels/range/generic/neon/fp32.cpp +++ b/src/cpu/kernels/range/generic/neon/fp32.cpp @@ -22,10 +22,10 @@ * SOFTWARE. 
*/ -#include "src/cpu/kernels/range/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" namespace arm_compute { diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h index 62144e6776..f8c30d52a0 100644 --- a/src/cpu/kernels/range/generic/neon/impl.h +++ b/src/cpu/kernels/range/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -47,35 +48,36 @@ void neon_range_function(ITensor *output, float start, float step, const Window const auto window_end_x = static_cast(window.x().end()); const int window_step_x = 16 / sizeof(T); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator output_it(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - const auto out_ptr = reinterpret_cast(output_it.ptr()); - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - for(int count = 0; count < window_step_x; ++count) + int x = window_start_x; + const auto out_ptr = reinterpret_cast(output_it.ptr()); + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - id_vec = wrapper::vsetlane(static_cast(x + count), id_vec, count); - } - - // start + step * id - const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); - wrapper::vstore(out_ptr + x, res_vec); - } + for (int count = 0; count < window_step_x; ++count) + { + id_vec = wrapper::vsetlane(static_cast(x + count), id_vec, count); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto res = start + x * step; - *(out_ptr + x) = res; - } + // start + step * id + const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); + wrapper::vstore(out_ptr + x, res_vec); + } - }, - output_it); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto res = start + x * step; + *(out_ptr + x) = res; + } + }, + output_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h index 25d52bfe7f..cade91e8dd 100644 --- a/src/cpu/kernels/range/list.h +++ b/src/cpu/kernels/range/list.h @@ -28,8 +28,7 @@ namespace arm_compute { namespace cpu { -#define DECLARE_RANGE_KERNEL(func_name) \ - void func_name(ITensor *output, float start, float step, const Window &window) +#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window) DECLARE_RANGE_KERNEL(fp16_neon_range_function); DECLARE_RANGE_KERNEL(fp32_neon_range_function); diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp index c265d5d4eb..cf99830562 100644 --- a/src/cpu/kernels/roialign/generic/neon/fp16.cpp +++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp @@ -29,7 +29,12 @@ namespace arm_compute { namespace cpu { -void neon_fp16_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_fp16_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { 
return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp index 51355aaef0..c1dba99b5e 100644 --- a/src/cpu/kernels/roialign/generic/neon/fp32.cpp +++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_fp32_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h index e5e604330a..db2f67705d 100644 --- a/src/cpu/kernels/roialign/generic/neon/impl.h +++ b/src/cpu/kernels/roialign/generic/neon/impl.h @@ -46,7 +46,7 @@ inline input_data_type roi_align_1x1(const ITensor *input, float region_end_y, int pz) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { return input_data_type(0); } @@ -55,9 +55,9 @@ inline input_data_type roi_align_1x1(const ITensor *input, const DataLayout data_layout = input->info()->data_layout(); float avg = 0; // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) + for (int iy = 0; iy < grid_size_y; ++iy) { - for(int ix = 0; ix < grid_size_x; ++ix) + for (int ix = 0; ix < grid_size_x; ++ix) { // Align the window in the middle of every bin float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); @@ -78,20 +78,28 @@ inline input_data_type roi_align_1x1(const ITensor *input, const float w2 = hy * lx; const float w3 = ly * hx; const float w4 = ly * lx; - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const auto data1 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); - const auto data2 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); - const auto data3 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); - const auto data4 = *reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); + const auto data1 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); + const auto data2 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); + const auto data3 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); + const auto data4 = *reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } else { - const auto data1 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); - const auto data2 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); - const auto data3 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); - const auto data4 = *reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); + const auto data1 = *reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); + const auto data2 = *reinterpret_cast( + 
input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); + const auto data3 = *reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); + const auto data4 = *reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } } @@ -117,21 +125,21 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, int pz, const QuantizationInfo &out_qinfo) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { return input_data_type(out_qinfo.uniform().offset); } else { - float avg = 0; - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); - const DataLayout data_layout = input->info()->data_layout(); + float avg = 0; + const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); + const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); + const DataLayout data_layout = input->info()->data_layout(); // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) + for (int iy = 0; iy < grid_size_y; ++iy) { - for(int ix = 0; ix < grid_size_x; ++ix) + for (int ix = 0; ix < grid_size_x; ++ix) { // Align the window in the middle of every bin float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); @@ -153,41 +161,89 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, const float w3 = ly * hx; const float w4 = ly * lx; - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - if(is_qasymm_signed) + if (is_qasymm_signed) { - float data1 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); + float data1 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } else { - float data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = 
dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); + float data1 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } } else { - if(is_qasymm_signed) + if (is_qasymm_signed) { - const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); + const auto data1 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto data3 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element( + Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } else { - const auto data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); + const auto data1 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto data3 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8(*reinterpret_cast( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } } @@ -197,7 +253,7 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, avg /= grid_size_x * grid_size_y; input_data_type res = 0; - if(is_qasymm_signed) + if (is_qasymm_signed) { res = quantize_qasymm8_signed(avg, out_qinfo); } @@ -215,7 +271,12 @@ inline float 
compute_region_coordinate(int p, float bin_size, float roi_anchor, } template -void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void roi_align(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -240,7 +301,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo const auto *rois_ptr = reinterpret_cast(rois->buffer()); const QuantizationInfo &rois_qinfo = rois->info()->quantization_info(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) { const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; @@ -252,7 +313,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo float x2(qx2); float y1(qy1); float y2(qy2); - if(is_qasymm) + if (is_qasymm) { x1 = dequantize_qasymm16(qx1, rois_qinfo); x2 = dequantize_qasymm16(qx2, rois_qinfo); @@ -267,44 +328,47 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo float bin_size_y = roi_dims_y / pool_info.pooled_height(); // Iterate through all feature maps - for(int ch = 0; ch < input_chanels; ++ch) + for (int ch = 0; ch < input_chanels; ++ch) { // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) + for (int py = 0; py < pooled_h; ++py) { - for(int px = 0; px < pooled_w; ++px) + for (int px = 0; px < pooled_w; ++px) { - const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); - const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); - const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); - const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); - const int roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); - const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y)); + const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); + const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); + const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); + const float region_end_y = + compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); + const int roi_bin_grid_x = + (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); + const int roi_bin_grid_y = + (pool_info.sampling_ratio() > 0) ? 
pool_info.sampling_ratio() : int(ceil(bin_size_y)); input_data_type out_val(0); - if(is_qasymm) + if (is_qasymm) { out_val = roi_align_1x1_qasymm8( - input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info()); + input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info()); } else { - out_val = roi_align_1x1( - input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch); + out_val = roi_align_1x1(input, roi_batch, region_start_x, bin_size_x, + roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch); } - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - auto out_ptr = reinterpret_cast(output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); - *out_ptr = out_val; + auto out_ptr = reinterpret_cast( + output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); + *out_ptr = out_val; } else { - auto out_ptr = reinterpret_cast(output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); - *out_ptr = out_val; + auto out_ptr = reinterpret_cast( + output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); + *out_ptr = out_val; } } } diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp index d6bd9a95ce..11c5770f53 100644 --- a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qu8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_qu8_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp index a839581aff..7f93cc87b3 100644 --- a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qs8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_qs8_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h index 1c71b02488..fdb3c0050d 100644 --- a/src/cpu/kernels/roialign/list.h +++ b/src/cpu/kernels/roialign/list.h @@ -27,9 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ROIALIGN_KERNEL(func_name) \ - void func_name(const ITensor *input, ITensor *output, const ITensor *rois, \ - ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +#define DECLARE_ROIALIGN_KERNEL(func_name) \ + void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \ + const Window &window, const ThreadInfo &info) 
DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign); DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign); DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign); diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp index 895f42215e..bd01569cc4 100644 --- a/src/cpu/kernels/scale/neon/fp16.cpp +++ b/src/cpu/kernels/scale/neon/fp16.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -41,8 +42,12 @@ namespace arm_compute { namespace { -void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -62,33 +67,46 @@ void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *of const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const float16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &id) { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const float16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); Iterator out(dst, window); const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; @@ -103,68 +121,97 @@ void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *o win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator in(src, win_in); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { using ConstType = typename std::conditional::value, half, float16_t>::type; const float16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const float16_t *in_ptr = + reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + + clamped_h * in_stride_wc); + const 
auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } else { ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -172,4 +219,4 @@ void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, c } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp index 2ab14cf83a..bbf92e0412 100644 --- a/src/cpu/kernels/scale/neon/integer.cpp +++ b/src/cpu/kernels/scale/neon/integer.cpp @@ -22,8 +22,9 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -33,8 +34,12 @@ namespace arm_compute { namespace { -void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void u8_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -54,43 +59,58 @@ void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const uint8_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const uint8_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int input_width = src->info()->dimension(1); const int input_height = src->info()->dimension(2); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_stride_wc = in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_stride_wc = + in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom); // Don't increment in Y and Z direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -100,24 +120,37 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off Iterator in(src, win_in); const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset); - const uint8_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) ? 
*(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset); + const uint8_t *in_ptr = + reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) + ? *in_ptr + : const_border_value; + const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -152,12 +185,12 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; uint8_t *out_ptr = out.ptr() + bo * out_stride_b; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -174,7 +207,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -205,7 +238,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -270,19 +303,21 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const auto out_2_int 
= wrapper::vcvta(out_2); const auto out_3_int = wrapper::vcvta(out_3); #else // defined(__aarch64__) && !defined(BARE_METAL) - const auto out_0_int = wrapper::vcvt(out_0); - const auto out_1_int = wrapper::vcvt(out_1); - const auto out_2_int = wrapper::vcvt(out_2); - const auto out_3_int = wrapper::vcvt(out_3); + const auto out_0_int = wrapper::vcvt(out_0); + const auto out_1_int = wrapper::vcvt(out_1); + const auto out_2_int = wrapper::vcvt(out_2); + const auto out_3_int = wrapper::vcvt(out_3); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -311,18 +346,27 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off } } -void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value); - if(border_mode == BorderMode::REPLICATE) + if (border_mode == BorderMode::REPLICATE) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_x = src->info()->strides_in_bytes()[1]; const int in_stride_y = src->info()->strides_in_bytes()[2]; @@ -356,12 +400,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const int8_t *in_ptr = reinterpret_cast(in.ptr() + bo * in_stride_b); int8_t *out_ptr = reinterpret_cast(out.ptr() + bo * out_stride_b); - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // 
Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -378,7 +422,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -409,7 +453,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -479,14 +523,16 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const auto out_2_int = wrapper::vcvt(out_2); const auto out_3_int = wrapper::vcvt(out_3); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -515,8 +561,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off } } -void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void s16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -536,33 +586,46 @@ void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const int16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &id) { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const int16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); Iterator out(dst, window); const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; @@ -577,64 +640,93 @@ void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *of win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator in(src, win_in); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? 
*(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const int16_t *in_ptr = + reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - const auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - const auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - const auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - const auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + const auto clamped_w = utility::clamp(offset, 0, 
in_dim_w - 1); + const auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); + const auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); + const auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = + *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast(out.ptr()) = + static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } else { ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { @@ -642,32 +734,50 @@ void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con } } -void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } } -void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool 
align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h index 28a1087224..0fe87d15a6 100644 --- a/src/cpu/kernels/scale/neon/list.h +++ b/src/cpu/kernels/scale/neon/list.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -34,10 +35,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) DECLARE_SCALE_KERNEL(s16_neon_scale); DECLARE_SCALE_KERNEL(u8_neon_scale); @@ -48,14 +49,20 @@ DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale); #undef DECLARE_SCALE_KERNEL template -void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, - bool align_corners, const Window &window) +void nearest_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(offsets); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_y = src->info()->strides_in_bytes()[1]; const int in_stride_z = src->info()->strides_in_bytes()[2]; @@ -84,17 +91,17 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets const int bo_end = window_execution[3].end(); const int bo_step = window_execution[3].step(); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += 
yo_step) { // Floating-point coordinate float yi_f = ((yo + sampling_offset) * scale_y); int yi = 0; - if(align_corners) + if (align_corners) { yi = utils::rounding::round_half_away_from_zero(yi_f); } @@ -103,12 +110,12 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets yi = static_cast(std::floor(yi_f)); } - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate float xi_f = ((xo + sampling_offset) * scale_x); int xi = 0; - if(align_corners) + if (align_corners) { xi = utils::rounding::round_half_away_from_zero(xi_f); } @@ -121,15 +128,15 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { auto out0 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T))); wrapper::vstore(reinterpret_cast(out_ptr + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { - auto out0 = *(reinterpret_cast(in_ptr + cout * sizeof(T))); + auto out0 = *(reinterpret_cast(in_ptr + cout * sizeof(T))); *(reinterpret_cast(out_ptr + cout * sizeof(T))) = out0; } } @@ -138,9 +145,16 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets } template -void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void bilinear_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(offsets); ARM_COMPUTE_UNUSED(dx); @@ -148,8 +162,10 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_y = src->info()->strides_in_bytes()[1]; const int in_stride_z = src->info()->strides_in_bytes()[2]; @@ -180,7 +196,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int bo_end = window_execution[3].end(); const int bo_step = window_execution[3].step(); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC using ConstType = typename std::conditional::value, half, T>::type; @@ -189,12 +205,12 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const T const_border_value = static_cast(constant_border_value.get()); - for(int bo = 
bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); @@ -204,7 +220,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const auto a1 = (yi_f - static_cast(yi)); const auto b1 = (1.f - a1); - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); @@ -223,32 +239,35 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { auto in00 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); auto in01 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); auto in10 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); auto in11 = wrapper::vdup_n(static_cast(const_border_value), ExactTagType{}); - if((yi >= 0) && (yi < in_dim_h)) + if ((yi >= 0) && (yi < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in00 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T))); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in01 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y)); + in01 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y)); } } - if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { - in10 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_z)); + in10 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_z)); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in11 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + in11 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); } } @@ -264,32 +283,33 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset wrapper::vstore(reinterpret_cast(out_ptr + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { auto in00 = static_cast(const_border_value); auto in01 = static_cast(const_border_value); auto in10 = static_cast(const_border_value); auto in11 = static_cast(const_border_value); - if((yi >= 0) && (yi < in_dim_h)) + if ((yi >= 0) && (yi < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in00 = *(reinterpret_cast(in_ptr + cout * sizeof(T))); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { in01 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y)); } } - if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in10 = 
*(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_z)); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in11 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + in11 = *( + reinterpret_cast(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); } } auto out0 = static_cast(0); @@ -303,14 +323,14 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset } } } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_w; uint8_t *out_ptr = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); @@ -327,7 +347,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int yi1_offset = yi1 * in_stride_z; const int y_offset = yo * out_stride_z; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); @@ -356,12 +376,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int offset = xo * out_stride_y + y_offset; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { - const auto in00 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); - const auto in01 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); - const auto in10 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); - const auto in11 = wrapper::vloadq(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + const auto in00 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const auto in01 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const auto in10 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const auto in11 = wrapper::vloadq( + reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); auto out0 = wrapper::vmul(in00, s00); out0 = wrapper::vmla(out0, in01, s01); @@ -370,12 +394,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset wrapper::vstore(reinterpret_cast(out_ptr + offset + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { - const T in00 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); - const T in01 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); - const T in10 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); - const T in11 = *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + const T in00 = + *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const T in01 = + *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const T in10 = + *(reinterpret_cast(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const T in11 = + 
*(reinterpret_cast(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); T out0 = in00 * s00_s; out0 += in01 * s01_s; @@ -394,15 +422,24 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset } template -void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void common_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - bilinear_neon_scale(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + bilinear_neon_scale(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp index 778459ae39..62a821daa5 100644 --- a/src/cpu/kernels/scale/neon/qasymm8.cpp +++ b/src/cpu/kernels/scale/neon/qasymm8.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace { -void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Data layout is NHWC const int32_t input_width = src->info()->dimension(1); @@ -40,10 +47,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; @@ -59,7 +68,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor win_in.set(1, Window::Dimension(0, 0, 0)); win_in.set(2, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); 
++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -68,36 +77,41 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor Iterator out(dst, window); const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t; using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -141,12 +155,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; uint8_t *out_ptr = out.ptr() + bo * out_stride_b; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -163,7 +177,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -194,7 +208,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -204,34 +218,82 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); - const auto in00_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), vscale_in); - const auto in00_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), vscale_in); - const auto in00_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), vscale_in); - const auto in00_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), vscale_in); + const auto in00_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul( + 
wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), + vscale_in); + const auto in00_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), + vscale_in); const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); - const auto in01_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), vscale_in); - const auto in01_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), vscale_in); - const auto in01_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), vscale_in); - const auto in01_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), vscale_in); + const auto in01_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), + vscale_in); + const auto in01_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), + vscale_in); const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); - const auto in10_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), vscale_in); - const auto in10_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), vscale_in); - const auto in10_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), vscale_in); - const auto in10_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), vscale_in); + const auto in10_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), + vscale_in); + const auto in10_2 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), + vscale_in); + const auto in10_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), + vscale_in); const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); - const auto in11_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), vscale_in); - const auto in11_1 = 
wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), vscale_in); - const auto in11_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), vscale_in); - const auto in11_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), vscale_in); + const auto in11_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), + vscale_in); + const auto in11_2 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), + vscale_in); + const auto in11_3 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), + vscale_in); auto out_0 = wrapper::vmul(in00_0, s00); out_0 = wrapper::vmla(out_0, in01_0, s01); @@ -264,14 +326,16 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const auto out_2_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_2, invvscale_o)); const auto out_3_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_3, invvscale_o)); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -292,7 +356,8 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor #if defined(__aarch64__) && !defined(BARE_METAL) *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info); #else // defined(__aarch64__) && !defined(BARE_METAL) - *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO); + *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = + quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO); #endif // defined(__aarch64__) && !defined(BARE_METAL) } } @@ -304,28 +369,38 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + 
BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - if(src->info()->quantization_info() == dst->info()->quantization_info()) + if (src->info()->quantization_info() == dst->info()->quantization_info()) { - u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window); + u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { - qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp index cd63dfba63..5a885178a7 100644 --- a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace { -void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Data layout is NHWC const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); @@ -40,10 +47,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int32_t input_height = src->info()->dimension(2); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; @@ -58,7 +67,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const win_in.set(1, Window::Dimension(0, 0, 0)); win_in.set(2, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -67,36 +76,41 @@ void 
qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const Iterator out(dst, window); const int8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t; using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -140,12 +154,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{}); const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const int8_t *in_ptr = reinterpret_cast(in.ptr() + bo * in_stride_b); int8_t *out_ptr = reinterpret_cast(out.ptr() + bo * out_stride_b); - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -162,7 +176,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -193,7 +207,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -203,34 +217,70 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); - const auto in00_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), vscale_in); - const auto in00_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), vscale_in); - const auto in00_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), vscale_in); - const auto in00_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), vscale_in); + const auto in00_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), + vscale_in); + const auto 
in00_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), + vscale_in); const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); - const auto in01_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), vscale_in); - const auto in01_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), vscale_in); - const auto in01_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), vscale_in); - const auto in01_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), vscale_in); + const auto in01_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), + vscale_in); + const auto in01_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), + vscale_in); const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); - const auto in10_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), vscale_in); - const auto in10_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), vscale_in); - const auto in10_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), vscale_in); - const auto in10_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), vscale_in); + const auto in10_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), + vscale_in); + const auto in10_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), + vscale_in); + const auto in10_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), + vscale_in); const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); - const auto in11_0 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), vscale_in); - const auto in11_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), vscale_in); - const auto in11_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), vscale_in); - const auto in11_3 = wrapper::vmul(wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), vscale_in); + const auto in11_0 = wrapper::vmul( + wrapper::vcvt(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), + 
vscale_in); + const auto in11_2 = wrapper::vmul(wrapper::vcvt(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), + vscale_in); + const auto in11_3 = + wrapper::vmul(wrapper::vcvt( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), + vscale_in); auto out_0 = wrapper::vmul(in00_0, s00); out_0 = wrapper::vmla(out_0, in01_0, s01); @@ -263,14 +313,16 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const auto out_2_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_2, invvscale_o)); const auto out_3_int = wrapper::vcvt(wrapper::vmla(voffset_o, out_3, invvscale_o)); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -291,7 +343,8 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const #if defined(__aarch64__) && !defined(BARE_METAL) *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info); #else // defined(__aarch64__) && !defined(BARE_METAL) - *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO); + *(out_ptr_xo_yo + cout * sizeof(int8_t)) = + quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO); #endif // defined(__aarch64__) && !defined(BARE_METAL) } } @@ -303,28 +356,39 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - if(src->info()->quantization_info() == dst->info()->quantization_info() && border_mode == BorderMode::REPLICATE) + if (src->info()->quantization_info() == dst->info()->quantization_info() && + border_mode == BorderMode::REPLICATE) { - s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { - qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, 
constant_border_value, sampling_offset, align_corners, window); + qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, + sampling_offset, align_corners, window); } } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp index ceda19f366..cb28f4cb1c 100644 --- a/src/cpu/kernels/scale/sve/fp16.cpp +++ b/src/cpu/kernels/scale/sve/fp16.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -41,8 +42,12 @@ namespace arm_compute { namespace { -void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -61,38 +66,50 @@ void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); } +} // namespace namespace cpu { -void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -103,4 +120,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp index f3472f1efd..cbb345edbb 100644 --- a/src/cpu/kernels/scale/sve/fp32.cpp +++ b/src/cpu/kernels/scale/sve/fp32.cpp @@ -25,23 +25,27 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#include - namespace arm_compute { namespace { -void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp32_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = 
*reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Store results + svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + out); } +} // namespace namespace cpu { -void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp32_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp index 82c70ee360..df950b1789 100644 --- a/src/cpu/kernels/scale/sve/integer.cpp +++ b/src/cpu/kernels/scale/sve/integer.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -39,8 +40,12 @@ namespace arm_compute { namespace { -void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void u8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + 
src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -59,32 +64,40 @@ void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offse const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } -void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void s16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -103,38 +116,50 @@ void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); } +} // namespace namespace cpu { -void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -144,12 +169,20 @@ void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, cons } } -void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h index b9c3a10a78..aff741a4a7 100644 --- a/src/cpu/kernels/scale/sve/list.h +++ b/src/cpu/kernels/scale/sve/list.h @@ -28,10 +28,10 @@ namespace arm_compute { namespace cpu { -#define 
DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) DECLARE_SCALE_KERNEL(fp16_sve_scale); DECLARE_SCALE_KERNEL(fp32_sve_scale); diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp index d45a69e43b..0fc794c6c2 100644 --- a/src/cpu/kernels/scale/sve/qasymm8.cpp +++ b/src/cpu/kernels/scale/sve/qasymm8.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -40,8 +40,12 @@ namespace arm_compute { namespace { -void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void qasymm8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor * const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } +} // namespace namespace cpu { -void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp index 67bca65f58..68ea01e29e 100644 --- a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -40,8 +40,12 @@ namespace arm_compute { namespace { -void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void qasymm8_signed_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const IT const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast(out.ptr()); - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } +} // namespace namespace cpu { -void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp index b460213c72..38a58099bd 100644 --- a/src/cpu/kernels/select/generic/neon/fp16.cpp +++ b/src/cpu/kernels/select/generic/neon/fp16.cpp @@ -23,20 +23,22 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/select/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_f16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16(c, x, y, output, window); } -void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return 
select_op_not_same_rank(c, x, y, output, window); } @@ -45,4 +47,4 @@ void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITe } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp index 63fd594901..50a80cb338 100644 --- a/src/cpu/kernels/select/generic/neon/fp32.cpp +++ b/src/cpu/kernels/select/generic/neon/fp32.cpp @@ -22,20 +22,22 @@ * SOFTWARE. */ -#include "src/cpu/kernels/select/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_f32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32(c, x, y, output, window); } -void neon_f32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h index 6a6d9969f8..7ce640b6ff 100644 --- a/src/cpu/kernels/select/generic/neon/impl.h +++ b/src/cpu/kernels/select/generic/neon/impl.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H #include "arm_compute/core/TensorInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/cpu/kernels/select/generic/neon/impl.h" @@ -37,8 +38,16 @@ namespace arm_compute namespace cpu { template -void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *)) +void select_op(const ITensor *cond, + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + const int window_step_x, + const int window_start_x, + const int window_end_x, + const int limit, + VectorType (*condition_conversion)(const uint8_t *)) { Window win = window; win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -48,30 +57,32 @@ void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITen Iterator input2(in2, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto condition_ptr = reinterpret_cast(condition.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - for(; x <= limit; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto c = (*condition_conversion)(condition_ptr + x); - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); - } - for(; x < window_end_x; ++x) - { - const auto c = *(condition_ptr + x); - const auto a = *(input1_ptr + x); - 
const auto b = *(input2_ptr + x); - *(output_ptr + x) = static_cast(c) ? a : b; - } - }, - condition, input1, input2, output); + auto output_ptr = reinterpret_cast(output.ptr()); + const auto condition_ptr = reinterpret_cast(condition.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = window_start_x; + for (; x <= limit; x += window_step_x) + { + const auto c = (*condition_conversion)(condition_ptr + x); + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); + } + for (; x < window_end_x; ++x) + { + const auto c = *(condition_ptr + x); + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = static_cast(c) ? a : b; + } + }, + condition, input1, input2, output); } template @@ -81,11 +92,14 @@ void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, IT const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); - }); + select_op( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); + }); } template @@ -95,11 +109,14 @@ void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, I const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); - }); + select_op( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); + }); } template @@ -109,15 +126,19 @@ void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, I const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); - }); + select_op( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> 
VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); + }); } template -void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void select_op_not_same_rank( + const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { ARM_COMPUTE_UNUSED(window); @@ -131,20 +152,20 @@ void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITen int offset = 0; const int step = 16 / in1->info()->element_size(); - for(int i = 0; i < outer_size; ++i) + for (int i = 0; i < outer_size; ++i) { int x = offset; const auto input_ptr = static_cast(*(condition_ptr + i)) ? input1_ptr : input2_ptr; - for(; x <= offset + inner_size - step; x += step) + for (; x <= offset + inner_size - step; x += step) { wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x)); } - if(x <= offset + inner_size - (step / 2)) + if (x <= offset + inner_size - (step / 2)) { wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x)); x += step / 2; } - for(; x < offset + inner_size; ++x) + for (; x < offset + inner_size; ++x) { *(output_ptr + x) = *(input_ptr + x); } diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp index 71b2f0b933..135087c261 100644 --- a/src/cpu/kernels/select/generic/neon/integer.cpp +++ b/src/cpu/kernels/select/generic/neon/integer.cpp @@ -25,59 +25,71 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include - #include "src/cpu/kernels/select/generic/neon/impl.h" +#include + namespace arm_compute { namespace cpu { -void neon_s8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_8(c, x, y, output, window); } -void neon_s16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16(c, x, y, output, window); } -void neon_s32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32(c, x, y, output, window); } -void neon_s8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } -void neon_s16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } -void neon_s32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s32_select_not_same_rank( + const ITensor 
*c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } -void neon_u8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_8(c, x, y, output, window); } -void neon_u16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16(c, x, y, output, window); } -void neon_u32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32(c, x, y, output, window); } -void neon_u8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } -void neon_u16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } -void neon_u32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank(c, x, y, output, window); } diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index f6556696b0..2e2adf33e0 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/neon/impl.h" @@ -30,8 +31,13 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); } @@ -40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index ddd270ae70..61df40c1b5 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -22,14 +22,20 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index f07fd2fb27..5d6e6a4f80 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/kernels/softmax/generic/neon/impl.h" + #include "support/SaturateCast.h" namespace arm_compute @@ -32,11 +33,10 @@ template void neon_logits_1d_max(const ITensor *in, ITensor *o template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { - static_assert(std::is_same::value - || std::is_same::value, + static_assert(std::is_same::value || std::is_same::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); const int start_x = in->info()->valid_region().anchor.x(); @@ -50,163 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi Iterator out_it(out, window); constexpr int vec_size = 16; - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); + float sum{}; + float sum_inversed{}; - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = 
vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float(vec_elements); + + if (is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); } - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } + /* Reduce sum */ + const auto sum_16_byte = + vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], 
vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); + /* Run remaining elements */ + for (; x < input_width; ++x) + { + float element{}; + if (is_log) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) + tmp_ptr[x] = element; + } + + if (!is_log) { - element = (max_val - in_ptr[x]) * scale_beta; - sum += std::exp(element); + sum_inversed = 256.f / sum; } else { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; + sum = std::log(sum); } - - tmp_ptr[x] = element; } - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else + /* Normalize exponentials */ { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - const float32x4x4_t sub = + using int_vec_type = wrapper::traits::neon_vector_t; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if (is_log) { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int(sub); + const float32x4x4_t sub = { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int(sub); + } + else + { + float32x4x4_t mul = { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if (is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - float32x4x4_t mul = + if (is_log) { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) + out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); + } + else { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = 
wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - + (is_qasymm8_signed ? 128.f : 0)); } - - normalized_value = convert_float_to_int(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0)); } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 206d36a2e0..4d9b789297 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -42,53 +43,65 @@ void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); + for (int i = 0; i < sum_stages; 
++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; + } - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); template -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -103,113 +116,118 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c constexpr int vec_size = 16 / sizeof(T); const int sum_stages = log2(vec_size / 2); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } + T sum{}; + T sum_inversed{}; - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) + /* Compute exponentials and sum */ { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); - 
if(is_log) + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if (is_log) + { + vec_elements = + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq( + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); } - else + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for (int i = 0; i < sum_stages; ++i) { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; + sum_res = wrapper::vpadd(sum_res, sum_res); } - tmp_ptr[x] = element; - } + sum = wrapper::vgetlane(sum_res, 0); - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast(std::log(sum)); - } - } + /* Run remaining elements */ + for (; x < input_width; ++x) + { + T element{}; + + if (is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if(is_log) + if (!is_log) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); + sum_inversed = T(1) / sum; } else { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); + sum = static_cast(std::log(sum)); } - wrapper::vstore(out_ptr + x, normalized_value); } - /* Run remaining elements */ - for(; x < input_width; ++x) + + /* Normalize exponentials */ { - if(is_log) + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - out_ptr[x] = tmp_ptr[x] - sum; + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); + if (is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); + } + else + { + normalized_value = + wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + if (is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index a572891561..40713dc496 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -22,14 +22,20 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<qasymm8_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 7d3fe6e046..2c5e284f54 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w { return neon_logits_1d_max<qasymm8_signed_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp index 15a523bfc9..5e94f72faf 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp @@ -23,14 +23,20 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); } @@ -39,6 +45,6 @@ void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max<float16_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp index 55c4aee426..d692cc2477 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp @@ -23,14 +23,20 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void 
sve_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); } @@ -39,5 +45,5 @@ void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 2340a31cbd..24f1bb8143 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/softmax/generic/sve/impl.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -36,42 +37,48 @@ void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); - auto max_val = svmaxv(all_true_pg, vec_max); + auto max_val = svmaxv(all_true_pg, vec_max); - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -82,88 +89,88 @@ void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *co const auto all_true_pg = wrapper::svptrue(); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = 
*reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - const auto vec_beta = wrapper::svdup_n(static_cast(beta)); + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - /* Init sum to zero */ - auto vec_sum = wrapper::svdup_n(static_cast(0)); + ScalarType sum{0}; - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do + /* Compute exponentials and sum */ { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); - if(!is_log) + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + const auto vec_beta = wrapper::svdup_n(static_cast(beta)); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do { - vec_elements = wrapper::svexp_z(pg, vec_elements); - vec_sum = svadd_m(pg, vec_sum, vec_elements); + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); + if (!is_log) + { + vec_elements = wrapper::svexp_z(pg, vec_elements); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + if (is_log) + { + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if (is_log) + { + sum = static_cast(std::log(sum)); } - svst1(pg, tmp_ptr + x, vec_elements); - - if(is_log) + else { - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + sum = ScalarType(1) / sum; } - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = static_cast(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do + /* Normalize exponentials */ { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - else + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast(0)); + if (is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } template void 
sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); @@ -171,9 +178,19 @@ template void sve_logits_1d_max(const ITensor *in, ITensor *out, cons template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); +template void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h index 4f76ec6a26..89a30d042f 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.h +++ b/src/cpu/kernels/softmax/generic/sve/impl.h @@ -33,8 +33,13 @@ template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp index e9044d5fc9..85e5ccfea1 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp index ab45ce598d..4be2e2eed6 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &wi { return sve_logits_1d_max(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 8f677c62d4..98b2f5117f 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -23,7 +23,9 @@ */ #include "src/cpu/kernels/softmax/generic/sve2/impl.h" + #include "arm_compute/core/Types.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -31,8 +33,8 @@ namespace arm_compute namespace cpu { template -void sve2_softmax_logits_1d_quantized(const 
ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void sve2_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -50,162 +52,173 @@ void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi const int inc_2 = static_cast(2 * svcntw()); const int inc_3 = static_cast(3 * svcntw()); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - float sum{}; + float sum{}; - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do + /* Compute exponentials and sum */ { - const auto vec_elements = svld1(pg, in_ptr + x); - const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + const auto vec_elements = svld1(pg, in_ptr + x); + const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); + if (is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + 
vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } - if(is_log) + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), + svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if (is_log) { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + sum = std::log(sum); } else { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + sum = 256.f / sum; } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - 
pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else + /* Normalize exponentials */ { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; - if(is_qasymm8_signed) + if (is_log) { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, res_0, offset_vec); - res_1 = svsub_z(pg_1, res_1, offset_vec); - res_2 = svsub_z(pg_2, res_2, offset_vec); - res_3 = svsub_z(pg_3, res_3, offset_vec); + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + + if (is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, res_0, offset_vec); + res_1 = svsub_z(pg_1, res_1, offset_vec); + res_2 = svsub_z(pg_2, res_2, offset_vec); + res_3 = svsub_z(pg_3, res_3, offset_vec); + } } - } - // Store value - const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); - svst1(pg, out_ptr + x, out); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = 
svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); + // Store value + const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h index abbcc15181..33fcc26cda 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.h +++ b/src/cpu/kernels/softmax/generic/sve2/impl.h @@ -31,8 +31,13 @@ namespace arm_compute namespace cpu { template -void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */ diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp index 810035eb9c..95623786b3 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve2_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp index 283b55e9ce..c20462fcef 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void 
sve2_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index ed3515f417..627ce0c264 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -28,9 +28,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SOFTMAX_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *max, void *const tmp, \ - ITensor *out, const float beta, bool is_log, const Window &window) +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \ + bool is_log, const Window &window) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); @@ -43,8 +43,7 @@ DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax); #undef DECLARE_SOFTMAX_KERNEL -#define DECLARE_LOGITS_KERNEL(func_name) \ - void func_name(const ITensor *in, ITensor *out, const Window &window) +#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window) DECLARE_LOGITS_KERNEL(neon_fp32_logits); DECLARE_LOGITS_KERNEL(neon_fp16_logits); diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h index f7e1a040bd..9f6c92271f 100644 --- a/src/cpu/kernels/sub/neon/list.h +++ b/src/cpu/kernels/sub/neon/list.h @@ -26,14 +26,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { -#define DECLARE_SUB_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +#define DECLARE_SUB_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint); DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint); @@ -44,7 +46,8 @@ DECLARE_SUB_KERNEL(sub_qsymm16_neon); #undef DECLARE_SUB_KERNEL template -void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -68,7 +71,7 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); Iterator output(dst, window); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -84,41 +87,44 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if(is_broadcast_input_2) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) + : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + if (is_broadcast_input_2) + { + res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); + } + wrapper::vstore(output_ptr + x, res); } - wrapper::vstore(output_ptr + x, res); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if(is_broadcast_input_2) + // Compute left-over elements + for (; x < window_end_x; ++x) { - res = static_cast(-1) * res; + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = + is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; + if (is_broadcast_input_2) + { + res = static_cast(-1) * res; + } + + *(output_ptr + x) = res; } - - *(output_ptr + x) = res; - } - }, - broadcast_input, non_broadcast_input, output); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -131,31 +137,32 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = is_sat ? 
wrapper::sub_sat(val1, val2) : val1 - val2; - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; + } + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp index ea6e5826dd..b750afce6e 100644 --- a/src/cpu/kernels/sub/neon/qasymm8.cpp +++ b/src/cpu/kernels/sub/neon/qasymm8.cpp @@ -23,21 +23,24 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void sub_qasymm8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint(src0, src1, dst, policy, window, false /*is_addition*/); } -void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp index a86c7f22f6..fb0bb62682 100644 --- a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp @@ -24,21 +24,24 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void sub_qasymm8_signed_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_signed_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint(src0, src1, dst, policy, window, false /*is_addition*/); } -void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp index 4dfdc0e78c..23e4b03843 100644 --- 
a/src/cpu/kernels/sub/neon/qsymm16.cpp +++ b/src/cpu/kernels/sub/neon/qsymm16.cpp @@ -25,14 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute { namespace cpu { -void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +59,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -65,7 +67,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -74,61 +76,62 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), - } - }; - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; + const float32x4x2_t bf = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + }}; + const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const float32x4x2_t af = + // Compute S elements per iteration + 
int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const float32x4x2_t af = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; + }}; - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), #endif //__aarch64__ - } - }; + }}; - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? 
(bfs - afs) : (afs - bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -140,38 +143,32 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const float32x4x2_t af = + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const float32x4x2_t af = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; + }}; - const float32x4x2_t bf = - { - { + const float32x4x2_t bf = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), - } - }; + }}; - const int32x4x2_t rf = - { - { + const int32x4x2_t rf = {{ #ifdef __aarch64__ vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), @@ -179,23 +176,22 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), #endif //__aarch64__ - } - }; + }}; - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp index 197e9850b9..44d70cf503 100644 --- a/src/cpu/operators/CpuActivation.cpp +++ b/src/cpu/operators/CpuActivation.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuActivation.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" 
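Note on the quantized subtraction reformatted above: sub_qsymm16_neon dequantizes both QSYMM16 inputs with their per-tensor scales, subtracts in float32, and requantizes the difference by multiplying with the reciprocal of the output scale before a saturating narrow back to int16 (the leftover scalar loop does the same via quantize_qsymm16). The standalone sketch below reproduces that round trip in scalar form for reference only; the helper names, the rounding choice and the example scales are illustrative assumptions and are not the library's own quantization utilities.

// Illustrative sketch only: scalar dequantize -> subtract -> requantize for
// symmetric 16-bit data, mirroring the leftover loop of sub_qsymm16_neon.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

namespace sketch
{
// Symmetric 16-bit quantization: real_value = stored_value * scale.
inline float dequantize_symm16(int16_t value, float scale)
{
    return static_cast<float>(value) * scale;
}

// Round to nearest and saturate to the int16 range (stand-in for quantize_qsymm16).
inline int16_t quantize_symm16(float value, float scale)
{
    const int rounded = static_cast<int>(std::lround(value / scale));
    return static_cast<int16_t>(std::min(32767, std::max(-32768, rounded)));
}

// Dequantize both operands with their own scales, subtract in float,
// then requantize the result with the output scale.
inline int16_t subtract_symm16(int16_t a, float scale_a, int16_t b, float scale_b, float scale_out)
{
    const float af = dequantize_symm16(a, scale_a);
    const float bf = dequantize_symm16(b, scale_b);
    return quantize_symm16(af - bf, scale_out);
}
} // namespace sketch

int main()
{
    // Example values (assumed): a = 100 at scale 0.5 (-> 50.0), b = 30 at scale 1.0 (-> 30.0),
    // output scale 0.25, so (50 - 30) / 0.25 = 80.
    std::cout << sketch::subtract_symm16(100, 0.5f, 30, 1.0f, 0.25f) << "\n";
    return 0;
}

The vectorized path in the hunk above follows the same arithmetic, only widening int16 lanes to float32x4 pairs (vmovl_s16/vcvtq_f32_s32), using vsubq_f32/vmulq_f32 with the precomputed 1/output_scale, and narrowing with vqmovn_s32 to get the saturation for free.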
@@ -42,7 +43,8 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con _kernel = std::move(k); } -Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) +Status +CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) { return kernels::CpuActivationKernel::validate(input, output, activation_info); } @@ -54,13 +56,17 @@ void CpuActivation::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -std::tuple CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple CpuContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && + !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -69,7 +75,7 @@ std::tuple CpuContext::create_activation(const AclTenso act_op->configure(&src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h index e21fc7d32c..ec442f92c8 100644 --- a/src/cpu/operators/CpuActivation.h +++ b/src/cpu/operators/CpuActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp index 41def8e22f..53cd7fa1b7 100644 --- a/src/cpu/operators/CpuAdd.cpp +++ b/src/cpu/operators/CpuAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuAdd.h" -#include "src/cpu/kernels/CpuAddKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuAddKernel.h" namespace arm_compute { namespace cpu { -void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAdd::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info); @@ -42,7 +45,11 @@ void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAdd::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { 
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuAddKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h index db05c100cc..5f60102de2 100644 --- a/src/cpu/operators/CpuAdd.h +++ b/src/cpu/operators/CpuAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -55,14 +56,22 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. * */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp index 590ee482ca..2f19f2f842 100644 --- a/src/cpu/operators/CpuAddMulAdd.cpp +++ b/src/cpu/operators/CpuAddMulAdd.cpp @@ -21,39 +21,49 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "src/cpu/operators/CpuAddMulAdd.h" + #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuAddMulAddKernel.h" -#include "src/cpu/operators/CpuAddMulAdd.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" namespace arm_compute { namespace cpu { -void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAddMulAdd::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); auto k = std::make_unique(); const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul); _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add); - k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info); + k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, + act_info); // Save auxilary memory requirements after configuration - _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size()); - _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size()); + _aux_mem[DequantizedBnMul] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, + _dequantized_bn_mul.total_size()); + _aux_mem[DequantizedBnAdd] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, + _dequantized_bn_add.total_size()); } else { @@ -63,13 +73,17 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input _kernel = std::move(k); } -Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32); TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32); @@ -77,11 +91,13 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *inpu ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul)); ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add)); - return 
kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, + add_output, final_output, policy, act_info); } else { - return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, + act_info); } } @@ -89,37 +105,32 @@ void CpuAddMulAdd::run(ITensorPack &tensors) { const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2); const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3); - CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true); - CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true); + CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, + true); + CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, + true); - ITensorPack dequantize_mul_pack = - { - { TensorType::ACL_SRC_0, bn_mul }, - { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() } - }; + ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul}, + {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}}; - ITensorPack dequantize_add_pack = - { - { TensorType::ACL_SRC_0, bn_add }, - { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() } - }; + ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add}, + {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}}; _dequantize_bn_mul.run(dequantize_mul_pack); _dequantize_bn_add.run(dequantize_add_pack); - ITensorPack add_mul_add_pack = - { - { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) }, - { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) }, - { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() }, - { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() }, - { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) }, - { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) }, + ITensorPack add_mul_add_pack = { + {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)}, + {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)}, + {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()}, + {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()}, + {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)}, + {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)}, }; NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack); diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h index cf1ece68f1..47db75c37e 100644 --- a/src/cpu/operators/CpuAddMulAdd.h +++ b/src/cpu/operators/CpuAddMulAdd.h @@ -42,20 +42,28 @@ public: * Similar to @ref NEAddMulAdd::configure() * */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - 
ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAddMulAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -77,7 +85,7 @@ private: TensorInfo _dequantized_bn_mul{}; TensorInfo _dequantized_bn_add{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp index 1cfd8c1d0e..55b9204d71 100644 --- a/src/cpu/operators/CpuCast.cpp +++ b/src/cpu/operators/CpuCast.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCast.h" -#include "src/cpu/kernels/CpuCastKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCastKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp index 4021fd8ded..5f517a8fcb 100644 --- a/src/cpu/operators/CpuConcatenate.cpp +++ b/src/cpu/operators/CpuConcatenate.cpp @@ -23,21 +23,20 @@ */ #include "src/cpu/operators/CpuConcatenate.h" -#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" -#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" -#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" -#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" namespace arm_compute { @@ -59,9 +58,9 @@ void CpuConcatenate::configure(const std::vector &srcs_vect unsigned int offset = 0; - for(unsigned int i = 0; i < _num_srcs; ++i) + for (unsigned int i = 0; i < _num_srcs; ++i) { - switch(axis) + switch (axis) { case Window::DimX: { @@ -98,16 +97,17 @@ void CpuConcatenate::configure(const std::vector &srcs_vect } } -Status CpuConcatenate::validate(const std::vector &srcs_vector, const ITensorInfo *dst, size_t axis) +Status +CpuConcatenate::validate(const std::vector &srcs_vector, const ITensorInfo *dst, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); 
ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2); unsigned int offset = 0; - for(const auto &src : srcs_vector) + for (const auto &src : srcs_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - switch(axis) + switch (axis) { case Window::DimX: { @@ -135,7 +135,7 @@ Status CpuConcatenate::validate(const std::vector &srcs_vec offset += src->dimension(axis); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -146,18 +146,18 @@ Status CpuConcatenate::validate(const std::vector &srcs_vec void CpuConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast(tensors.size() - 1) != static_cast(_num_srcs)) + if (static_cast(tensors.size() - 1) != static_cast(_num_srcs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h index eb11926b48..c36977c70f 100644 --- a/src/cpu/operators/CpuConcatenate.h +++ b/src/cpu/operators/CpuConcatenate.h @@ -68,8 +68,8 @@ public: private: std::vector> _concat_kernels{}; - unsigned int _num_srcs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_srcs{0}; + unsigned int _axis{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp index 16ac16b3ba..19311733db 100644 --- a/src/cpu/operators/CpuConv2d.cpp +++ b/src/cpu/operators/CpuConv2d.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuConv2d.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv2d.h" #include "src/cpu/operators/CpuGemm.h" @@ -35,26 +37,35 @@ namespace arm_compute { namespace cpu { -CpuConv2d::CpuConv2d() - : _function() +CpuConv2d::CpuConv2d() : _function() { } CpuConv2d::~CpuConv2d() = default; -void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuConv2d::configure(ITensorInfo *input, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); + ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { @@ -92,19 +103,30 @@ void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITenso _aux_mem = _function->workspace(); } -Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon"); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, 
biases, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM: - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM_CONV2D: ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info)); @@ -120,9 +142,14 @@ Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, return Status{}; } -ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); ARM_COMPUTE_UNUSED(weights_info); @@ -137,35 +164,46 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co using ConvolutionConfiguration = std::tuple; using ConfigurationMethod = std::pair; - const std::vector known_configs = - { + const std::vector known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U)), + ConvolutionMethod::GEMM), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM), + ConfigurationMethod( + ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM) - }; + ConfigurationMethod( + ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM)}; const auto find_config = [&](ConfigurationMethod c) { const ConvolutionConfiguration config = c.first; const PadStrideInfo info = std::get<3>(config); - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), 
weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; std::vector::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } @@ -173,43 +211,49 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co { // SRGAN // Output might not be initialized when it is an internal tensor of the layer using the convolution - if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) - && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) && + (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if(input->dimension(idx_c) < 16) + if (input->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // This heuristics only applies to F16 data type on A55r1 - if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16) + if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && + input->data_type() == DataType::F16) { // Exclude known bad winograd configs (and defaults to GEMM) - const std::vector known_bad_winograd_f16_with_fastmath_configs = - { + const std::vector known_bad_winograd_f16_with_fastmath_configs = { // Squeezenet_V1_1 fire2 and fire3 - ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire6 and fire7 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire8 and fire9 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), + PadStrideInfo(1U, 1U, 1U, 1U)), }; const auto find_conv_config = [&](ConvolutionConfiguration c) { const PadStrideInfo info = std::get<3>(c); - return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(c) == Size2D(weights->dimension(idx_c), 
weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; - bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(), - find_conv_config) - != known_bad_winograd_f16_with_fastmath_configs.end(); - if(found_bad) + bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), + known_bad_winograd_f16_with_fastmath_configs.end(), + find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end(); + if (found_bad) { return ConvolutionMethod::GEMM; } @@ -217,16 +261,16 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // For 1x1 convolutions run the default GEMM - if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) + if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) { return ConvolutionMethod::GEMM; } - if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) + if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::WINOGRAD; } - if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) + if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) { return ConvolutionMethod::GEMM_CONV2D; } diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h index 0908ac0cbb..71b9e15dc1 100644 --- a/src/cpu/operators/CpuConv2d.h +++ b/src/cpu/operators/CpuConv2d.h @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -102,17 +103,32 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d * * Similar to CpuConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref CpuConv2d * * @param[in] src Source tensor info. 
3 lower dimensions represent a single input [width, height, IFM], @@ -132,11 +148,17 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp index 810ffb1e4e..49e31926e3 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" @@ -31,7 +32,10 @@ namespace arm_compute { namespace cpu { -void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique(); @@ -39,7 +43,10 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorI _kernel = std::move(k); } -Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } @@ -48,5 +55,5 @@ void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors) { NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h index ea70eee134..e208cca3a0 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h @@ -41,14 +41,18 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void + configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run(ITensorPack &tensors) override; }; diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp index 7420ff6240..92c19d4df2 100644 --- a/src/cpu/operators/CpuCopy.cpp +++ b/src/cpu/operators/CpuCopy.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCopy.h" -#include "src/cpu/kernels/CpuCopyKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCopyKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp index 884fe5c4ed..54075f2afa 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -24,10 +24,11 @@ #include "src/cpu/operators/CpuDepthwiseConv2d.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -37,11 +38,16 @@ namespace cpu { namespace { -Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status validate_arguments_optimized(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + if (!is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } @@ -49,14 +55,17 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() + - 
info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > + src->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > + src->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if (biases != nullptr) { - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } @@ -64,7 +73,7 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -80,8 +89,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _has_bias = biases != nullptr; @@ -91,10 +100,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI _are_weights_const = weights->are_values_constant(); // Configure pipeline - _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); + _is_activationlayer_enabled = + info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); _dwc_optimized_func = std::make_unique(); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique(); _permute_weights = std::make_unique(); @@ -128,7 +138,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI } // Configure activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, nullptr, info.act_info); @@ -155,7 +165,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute input - if(_permute) + if (_permute) { ITensorPack pack; auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -166,7 +176,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run assembly function - if(_is_nchw) + if (_is_nchw) { auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -198,7 +208,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Permute output - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -208,7 +218,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -221,7 +231,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac { // if weights are not constant then we need to repack so that weights // can be updated in-place - if(!_are_weights_const) + if (!_are_weights_const) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -238,14 +248,14 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac return; } - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute weights - if(_permute) + if (_permute) { auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); @@ -279,11 +289,15 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac } } -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, 
weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info)); _is_nchw = src->data_layout() == DataLayout::NCHW; _is_prepared = !_is_nchw; @@ -294,9 +308,10 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, auto input_perm = std::make_unique(); auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + auto output_perm = std::make_unique( + dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique(); _permute_weights = std::make_unique(); @@ -315,7 +330,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, _depthwise_conv_kernel = std::make_unique(); _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - if(_is_nchw) + if (_is_nchw) { _permute_output = std::make_unique(); _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); @@ -324,43 +339,61 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, //Configure Activation Layer _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, nullptr, info.act_info); } } -Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { TensorShape permuted_input_shape = src->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + TensorShape permuted_output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); + const TensorInfo permuted_input = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_weights = TensorInfo(weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + 
.set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_output = TensorInfo(dst->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NCHW)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); } // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -375,7 +408,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - if(_is_nchw) + if (_is_nchw) { prepare(tensors); auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); @@ -392,7 +425,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } else { @@ -401,10 +435,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -413,7 +448,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) _permute_output->run(pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -424,7 +459,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -441,12 +476,17 
@@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors } } -void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); - _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); - switch(_depth_conv_func) + _depth_conv_func = + get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.configure(src, weights, biases, dst, info); @@ -459,10 +499,14 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, } } -Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); - switch(depth_conv_func) + switch (depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); @@ -475,10 +519,13 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *w } } -DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { - if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) + if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) { return DepthwiseConvolutionFunction::OPTIMIZED; } @@ -490,7 +537,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi void CpuDepthwiseConv2d::run(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.run(tensors); @@ -505,7 +552,7 @@ void CpuDepthwiseConv2d::run(ITensorPack &tensors) void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.prepare(tensors); diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h index 3d8719ee44..7eaa0df857 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.h +++ b/src/cpu/operators/CpuDepthwiseConv2d.h @@ -24,8 +24,9 @@ #ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H -#include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -56,14 +57,22 @@ public: * 
Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. * @param[in] info Depthwise convolution meta-data. */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d * * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 @@ -76,7 +85,10 @@ public: * * @return a Depthwise Convolution Function */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info); // Inherited methods overriden: @@ -118,32 +130,40 @@ private: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dOptimizedInternal::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overriden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; private: - std::unique_ptr _dwc_optimized_func{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - bool _are_weights_const{ true }; + std::unique_ptr _dwc_optimized_func{nullptr}; + std::unique_ptr _permute_input{nullptr}; + std::unique_ptr _permute_weights{nullptr}; + std::unique_ptr _permute_output{nullptr}; + std::unique_ptr _activationlayer_function{nullptr}; + bool _has_bias{false}; + bool _is_quantized{false}; + bool _is_nchw{true}; + bool _permute{false}; + bool _is_activationlayer_enabled{false}; + bool _is_prepared{false}; + bool _are_weights_const{true}; }; /** Basic 
function to execute a generic depthwise convolution. This function calls the following kernel: @@ -176,7 +196,11 @@ private: * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. * @param[in] info Depthwise convolution meta-data. */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * @@ -184,24 +208,28 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; private: - std::unique_ptr _depthwise_conv_kernel{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; + std::unique_ptr _depthwise_conv_kernel{nullptr}; + std::unique_ptr _permute_input{nullptr}; + std::unique_ptr _permute_weights{nullptr}; + std::unique_ptr _permute_output{nullptr}; + std::unique_ptr _activationlayer_function{nullptr}; + bool _is_nchw{true}; + bool _is_prepared{false}; + bool _is_activationlayer_enabled{false}; }; - DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC }; + DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC}; CpuDepthwiseConv2dOptimizedInternal _func_optimized{}; CpuDepthwiseConv2dGeneric _func_generic{}; }; diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index d078155155..8d3741de96 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -38,15 +39,14 @@ namespace cpu { struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl { - std::unique_ptr asm_kernel{ nullptr }; - bool is_prepared{ false }; - bool are_weights_const{ true }; + std::unique_ptr asm_kernel{nullptr}; + bool is_prepared{false}; + bool are_weights_const{true}; experimental::MemoryRequirements mem_req{}; }; #ifndef DOXYGEN_SKIP_THIS -CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() - : _pImpl(std::make_unique()) +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique()) { } #endif /* DOXYGEN_SKIP_THIS */ @@ -66,7 +66,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, _pImpl->are_weights_const = weights->are_values_constant(); // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - 
if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) + if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) { return; } @@ -77,12 +77,16 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, // Compute memory requirements for assembly kernels constexpr size_t alignment = 4096; - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment }); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment }); + _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment}); + _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment}); _pImpl->asm_kernel = std::move(dwc_wrapper); } -Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) { return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info); } @@ -111,7 +115,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) { const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) + if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) { // Pack weights and bias const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -125,11 +129,12 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) const auto weights_padding = weights->info()->padding(); const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; - const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); + const size_t ld_weights_row = + ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); weights->mark_as_unused(); - if(bias != nullptr) + if (bias != nullptr) { bias->mark_as_unused(); } diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h index f222ab9cf9..f1816625d2 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -53,14 +54,22 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Checks if activation is supported by the assembly kernels * * @param[in] activation Activation to check @@ -70,8 +79,8 @@ public: static bool is_activation_supported(const ActivationLayerInfo &activation); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp index 12dc136ba3..c05a23f3a7 100644 --- a/src/cpu/operators/CpuDequantize.cpp +++ b/src/cpu/operators/CpuDequantize.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDequantizeKernel.h" diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp index 9cdbdb61c1..135a3bb2b9 100644 --- a/src/cpu/operators/CpuDirectConv2d.cpp +++ b/src/cpu/operators/CpuDirectConv2d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,12 +37,25 @@ namespace cpu CpuDirectConv2d::~CpuDirectConv2d() = default; CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() + : _memory_group(std::move(memory_manager)), + _output_stage_kernel(), + _conv_kernel(), + _input_border_handler(), + _activationlayer_function(), + _accumulator(), + _has_bias(false), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ), + _is_padding_required() { } -void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CpuDirectConv2d::configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info); @@ -51,7 +65,7 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT _input_border_handler = std::make_unique(); // Free accumulator - if(_accumulator.buffer() != nullptr) + if 
(_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -62,28 +76,33 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT _has_bias = (bias != nullptr); _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) + if (_has_bias) { _output_stage_kernel->configure(dst, bias); } _is_padding_required = !_conv_kernel->border_size().empty(); - if(_is_padding_required) + if (_is_padding_required) { // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, + PixelValue(static_cast(0.f))); } //Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, dst, act_info); } } -Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, +Status CpuDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); @@ -95,7 +114,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), @@ -106,7 +125,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate bias kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); } @@ -122,14 +141,15 @@ void CpuDirectConv2d::run(ITensorPack &tensors) auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_is_padding_required) + if (_is_padding_required) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), + pack); } NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) + if (_has_bias) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, dst); @@ -138,7 +158,7 @@ void CpuDirectConv2d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h index fa8d61e083..73c85f2dcd 100644 --- a/src/cpu/operators/CpuDirectConv2d.h +++ b/src/cpu/operators/CpuDirectConv2d.h @@ -24,13 +24,14 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H #define 
ARM_COMPUTE_CPU_DIRECTCONV2D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -75,14 +76,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -95,10 +105,10 @@ private: std::unique_ptr _input_border_handler; std::unique_ptr _activationlayer_function; Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; + bool _has_bias{false}; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; + bool _is_padding_required{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp index aa74e420a6..626f1c6775 100644 --- a/src/cpu/operators/CpuDirectConv3d.cpp +++ b/src/cpu/operators/CpuDirectConv3d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,11 +37,17 @@ namespace cpu CpuDirectConv3d::~CpuDirectConv3d() = default; CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ) + : _memory_group(std::move(memory_manager)), + _conv_kernel(), + _activationlayer_function(), + _accumulator(), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ) { } -void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) +void CpuDirectConv3d::configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) { ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info); ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); @@ -48,7 +55,7 @@ void 
CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen _conv_kernel = std::make_unique(); // Free accumulator - if(_accumulator.buffer() != nullptr) + if (_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -59,21 +66,25 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen //Configure Activation Layer _is_activationlayer_enabled = conv_info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique(); _activationlayer_function->configure(dst, dst, conv_info.act_info); } } -Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info) +Status CpuDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info)); - if(conv_info.act_info.enabled()) + if (conv_info.act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info)); } @@ -89,7 +100,7 @@ void CpuDirectConv3d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -98,4 +109,4 @@ void CpuDirectConv3d::run(ITensorPack &tensors) } } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h index cde01f07c2..3ad1e09a14 100644 --- a/src/cpu/operators/CpuDirectConv3d.h +++ b/src/cpu/operators/CpuDirectConv3d.h @@ -24,14 +24,15 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H #define ARM_COMPUTE_CPU_DIRECTCONV3D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -76,14 +77,19 @@ public: * The 1st dimensions must be equal to the 1st dimension of the @p kernels tensor. * @param[in] conv_info Contains padding, stride, acitvation information. 
*/ - void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); + void configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv3d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -93,8 +99,8 @@ private: std::unique_ptr _conv_kernel; std::unique_ptr _activationlayer_function; Tensor _accumulator; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp index b88ae3e514..c2ae8773c6 100644 --- a/src/cpu/operators/CpuElementwise.cpp +++ b/src/cpu/operators/CpuElementwise.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/operators/CpuElementwise.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/CpuElementwiseKernel.h" @@ -33,7 +34,7 @@ namespace cpu void CpuElementwiseBase::run(ITensorPack &tensors) { // If the kernel has been configured, use the window from the kernel. - if(_kernel->is_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -101,12 +102,16 @@ void CpuElementwiseComparisonStatic::configure(const ITensorInfo *src0, con } template -Status CpuElementwiseComparisonStatic::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status +CpuElementwiseComparisonStatic::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst); } -void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op) +void CpuElementwiseComparison::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ComparisonOperation op) { ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); auto k = std::make_unique(); @@ -114,7 +119,10 @@ void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorI _kernel = std::move(k); } -Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op) +Status CpuElementwiseComparison::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ComparisonOperation op) { return kernels::CpuComparisonKernel::validate(op, src0, src1, dst); } @@ -127,4 +135,4 @@ template class CpuElementwiseComparisonStatic template class CpuElementwiseComparisonStatic; template class CpuElementwiseComparisonStatic; } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h index b6c61cf245..5db53c8026 100644 --- a/src/cpu/operators/CpuElementwise.h +++ b/src/cpu/operators/CpuElementwise.h @@ -139,7 +139,8 @@ 
public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); }; /** Basic function to run @ref cpu::kernels::CpuComparisonKernel @@ -182,4 +183,4 @@ using NELessEqual = CpuElementwiseComparisonStaticis_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -57,4 +58,4 @@ void CpuElementwiseUnary::run(ITensorPack &tensors) ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h index 5e8e98d047..1e51bfaa1c 100644 --- a/src/cpu/operators/CpuElementwiseUnary.h +++ b/src/cpu/operators/CpuElementwiseUnary.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -56,4 +57,4 @@ public: } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp index 3d8f62fe07..1890d0b916 100644 --- a/src/cpu/operators/CpuFill.cpp +++ b/src/cpu/operators/CpuFill.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuFill.h" -#include "src/cpu/kernels/CpuFillKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFillKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h index 41d9a9fa8a..cb83745d29 100644 --- a/src/cpu/operators/CpuFill.h +++ b/src/cpu/operators/CpuFill.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_FILL_H #include "arm_compute/core/PixelValue.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp index 7bab9e481c..2609d44590 100644 --- a/src/cpu/operators/CpuFlatten.cpp +++ b/src/cpu/operators/CpuFlatten.cpp @@ -23,16 +23,14 @@ */ #include "src/cpu/operators/CpuFlatten.h" -#include "src/cpu/operators/CpuReshape.h" - #include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuReshape.h" namespace arm_compute { namespace cpu { -CpuFlatten::CpuFlatten() - : _reshape(nullptr) +CpuFlatten::CpuFlatten() : _reshape(nullptr) { } diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp index 868add7d29..a107393b01 100644 --- a/src/cpu/operators/CpuFloor.cpp +++ b/src/cpu/operators/CpuFloor.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuFloor.h" -#include "src/cpu/kernels/CpuFloorKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFloorKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp index 395d8d2aa5..85a0b0311b 100644 --- a/src/cpu/operators/CpuFullyConnected.cpp +++ b/src/cpu/operators/CpuFullyConnected.cpp @@ -25,10 +25,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include 
"arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" @@ -49,8 +50,11 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -62,10 +66,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -78,14 +83,22 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format) +Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + bool enable_fast_math, + WeightFormat weight_format) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); GEMMLowpOutputStageInfo gemmlowp_output_stage_info; ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info)); @@ -97,11 +110,8 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe // Validate gemmlowp function TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info, - &weights_info, - biases, - dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info)); } else { @@ 
-142,21 +152,28 @@ CpuFullyConnected::CpuFullyConnected() CpuFullyConnected::~CpuFullyConnected() = default; -void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); // Configure gemmlowp function and output stage for asymmetric quantized types GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); + const Status status = + get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); GEMMInfo gemm_info; @@ -179,7 +196,11 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo * } } -void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); @@ -195,7 +216,11 @@ void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorI configure_mm(&_flattened_src, weights, biases, dst, act); } -void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); @@ -203,17 +228,17 @@ void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInf configure_mm(src, weights, biases, dst, act); } -void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void CpuFullyConnected::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo 
&weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - fc_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info); _needs_weights_conversion = false; @@ -238,9 +263,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -248,7 +275,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Reshape weights if needed - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Reshape the weights _transpose_weights = std::make_unique(); @@ -260,13 +287,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique(); - _convert_weights->configure(weights_to_use, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); _converted_weights.set_are_values_constant(weights_to_use->are_values_constant()); @@ -275,7 +300,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei _trans_weights_idx = AuxTensorIdx::ConvertedWeights; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info); @@ -287,54 +312,57 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Retain the tensorinfo with the weights to use - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { _trans_weights = *weights_to_use; } // Set auxiliary memory requirements auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time. _aux_mem[TransposedWeights] = MemoryInfo( offset_int_vec(TransposedWeights), - _dynamic_weights ? 
MemoryLifetime::Temporary : - (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent : - MemoryLifetime::Prepare, + _dynamic_weights ? MemoryLifetime::Temporary + : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare, _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : - _needs_weights_conversion ? MemoryLifetime::Prepare : - MemoryLifetime::Persistent, - _reshaped_weights.total_size()); + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_weights ? MemoryLifetime::Temporary + : _needs_weights_conversion ? MemoryLifetime::Prepare + : MemoryLifetime::Persistent, + _reshaped_weights.total_size()); _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, + offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, _converted_weights.total_size()); } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info) +Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info) { GEMMInfo gemm_info; gemm_info.set_activation_info(fc_info.activation_info); @@ -345,12 +373,17 @@ Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weigh return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status CpuFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); if (is_fixed_format_fast_math(weights_info.weight_format())) { @@ -364,15 +397,22 @@ Status CpuFullyConnected::validate(const 
ITensorInfo *src, const ITensorInfo *we } ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); + const ITensorInfo &flatten_src = + TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped + ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -386,10 +426,10 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -399,36 +439,37 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we } } - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { is_fc_after_conv = src->num_dimensions() > 1; } - if(!weights_reshaped) + if (!weights_reshaped) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src)); @@ -440,7 +481,8 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); } // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, + fc_info.enable_fast_math, weights_info.weight_format())); return Status{}; } @@ -460,21 +502,21 @@ void CpuFullyConnected::run(ITensorPack &tensors) CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false); // Linearize src if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, 
src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src); - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()); } // Run matrix multiply - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { _mm_gemmlowp->run(gemm_pack); } @@ -486,7 +528,7 @@ void CpuFullyConnected::run(ITensorPack &tensors) void CpuFullyConnected::prepare(ITensorPack &tensors) { - if(!_is_prepared || _dynamic_weights) + if (!_is_prepared || _dynamic_weights) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -502,20 +544,21 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape of the weights (happens only once) - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; - NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; + NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), + transpose_pack); cur_weights->mark_as_unused(); cur_weights = reshaped_weights.get(); } // Convert weights if needed (happens only once) - if(_needs_weights_conversion) + if (_needs_weights_conversion) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -526,7 +569,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Prepare GEMM prepare and release unused weights - if(!_is_quantized_asymmetric) + if (!_is_quantized_asymmetric) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h index 1e8c6478d0..7073fb9f7c 100644 --- a/src/cpu/operators/CpuFullyConnected.h +++ b/src/cpu/operators/CpuFullyConnected.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H #define ARM_COMPUTE_CPU_FULLY_CONNECTED_H -#include "src/cpu/ICpuOperator.h" - #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/FullyConnectedLayerInfo.h" +#include "src/cpu/ICpuOperator.h" + #include namespace arm_compute @@ -86,16 +86,24 @@ public: * @param[in] fc_info (Optional) Fully connected layer additional info * @param[in] weights_info (Optional) Stores neccessary compute information when weights are already reshaped */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected * * Similar to @ref CpuFullyConnected::configure() * * @return a status */ - 
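For reference, a minimal usage sketch of the validate()/configure() pair declared in this header, operating on ITensorInfo descriptors only (hypothetical 128-input/64-output F32 layer, default FullyConnectedLayerInfo and WeightsInfo):

    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/operators/CpuFullyConnected.h"

    using namespace arm_compute;

    TensorInfo src(TensorShape(128U), 1, DataType::F32);          // 128 inputs, single batch
    TensorInfo weights(TensorShape(128U, 64U), 1, DataType::F32); // transposed internally (transpose_weights = true)
    TensorInfo biases(TensorShape(64U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U), 1, DataType::F32);

    cpu::CpuFullyConnected fc;
    if (bool(cpu::CpuFullyConnected::validate(&src, &weights, &biases, &dst)))
    {
        fc.configure(&src, &weights, &biases, &dst); // fc_info / weights_info keep their defaults
    }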
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format * weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same @@ -103,19 +111,35 @@ public: * * @return a status */ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, WeightsInfo weights_info); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info); //Inherited methods override - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); + void configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); enum AuxTensorIdx { diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp index 34b845928d..8da166dbef 100644 --- a/src/cpu/operators/CpuGemm.cpp +++ b/src/cpu/operators/CpuGemm.cpp @@ -24,9 +24,10 @@ #include "src/cpu/operators/CpuGemm.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -57,17 +58,25 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) } } // namespace -void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +void CpuGemm::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, 
+ const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info)); ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info); - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool is_c_bias = beta == 1 && c != nullptr; - bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool is_c_bias = beta == 1 && c != nullptr; + bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. // Check if we need to reshape the matrix B only on the first run _is_prepared = false; @@ -76,9 +85,12 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _run_alpha_scale = alpha != 1.f; _run_bias_addition = is_c_bias; _run_addition = beta != 0 && beta != 1 && c != nullptr; - _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); + _run_activation = + gemm_info.activation_info().enabled() && + (!run_optimised || + (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); - if(run_optimised) + if (run_optimised) { const ITensorInfo *c_to_use = is_c_bias ? 
c : nullptr; _asm_glue = std::make_unique(); @@ -90,10 +102,11 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _aux_mem[Pretraspose] = asm_mem_req[Pretraspose]; // Scale product by alpha - if(_run_alpha_scale) + if (_run_alpha_scale) { _alpha_scale_func = std::make_unique(); - _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); + _alpha_scale_func->configure( + d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); } } else @@ -104,7 +117,7 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _mm_kernel = std::make_unique(); // Select between GEMV and GEMM - if(_run_vector_matrix_multiplication) + if (_run_vector_matrix_multiplication) { // Configure the matrix multiply kernel _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false); @@ -118,41 +131,50 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso // Configure interleave kernel _interleave_kernel = std::make_unique(); _interleave_kernel->configure(a, &_tmp_a); - _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[InterleavedLHS] = + MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); // Configure transpose kernel _transpose_kernel = std::make_unique(); _transpose_kernel->configure(b, &_tmp_b); - _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + _aux_mem[TransposedRHS] = + MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); // Configure matrix multiplication kernel _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); } - if(_run_bias_addition) + if (_run_bias_addition) { _add_bias = std::make_unique(); _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); - _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); + _aux_mem[TempResult] = + MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); } } // Configure matrix addition kernel - if(_run_addition) + if (_run_addition) { _ma_kernel = std::make_unique(); _ma_kernel->configure(c, d, beta); } // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func = std::make_unique(); _activation_func->configure(d, nullptr, gemm_info.activation_info()); } } -Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +Status CpuGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); const bool is_c_bias = beta == 1 && c != nullptr; @@ -162,7 +184,7 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_fixed_format_fast_math(gemm_info.weight_format())) + if (is_fixed_format_fast_math(gemm_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); @@ 
-174,46 +196,54 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens const int block_by = arm_compute::block_by(gemm_info.weight_format()); // test if im2col has changed the dimensions that are needed for padding - if(a->dimension(0) != b->dimension(1) && block_by > 1) + if (a->dimension(0) != b->dimension(1) && block_by > 1) { // have to verify bias const size_t dim0_sz = a->dimension(0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz % block_by) != 0, + ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right // b->dimension(1) = kernel_area * input_channel // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by; const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz - kernel_area * input_pad_right) != b->dimension(1), + "The product AB is defined only if A number of columns and B number of rows are related"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + a->dimension(0) != b->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); } ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - if(a->data_type() != DataType::BFLOAT16) + if (a->data_type() != DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d); } - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), + "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), + "The C matrix must have the same number of columns as the matrix B"); } - if(d->total_size() != 0) + if (d->total_size() != 0) { // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more. 
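A worked instance of the block_by padding arithmetic checked above (hypothetical values: block_by = 4, a 3x3 kernel so kernel_area = 9, 3 input channels, hence im2col pads each row with input_pad_right = 1):

    // b->dimension(1) = kernel_area * input_channel         = 9 * 3       = 27
    // a->dimension(0) = kernel_area * (input_channel + pad) = 9 * (3 + 1) = 36   (36 % block_by == 0)
    // input_pad_right = (36 - 27) % 4                       = 1
    // kernel_area     = (36 - 27) / input_pad_right         = 9
    // 36 - kernel_area * input_pad_right = 27 == b->dimension(1)  ->  the product AB is well defined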
ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) + if (gemm_info.depth_output_gemm3d() != 0) { - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2)); @@ -230,15 +260,19 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens } // Check if we need to run the optimized assembly kernel - cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. - - if(!run_optimised) + cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), + "CpuGemm cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, + "CpuGemm cannot reinterpret the output tensor as 3D"); // Check if the first input tensor is a vector. 
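To summarise how the C operand is routed by the is_c_bias / run_addition flags used in this function (derived from the conditions above):

    // beta == 1 and C present       -> C acts as a bias: added via CpuAdd or passed to the assembly dispatch.
    // beta not in {0, 1}, C present -> C is scaled by beta through CpuGemmMatrixAdditionKernel; 3D
    //                                  reinterpretation of input/output is rejected in this case.
    // beta == 0 or C == nullptr     -> C does not contribute to the result.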
const bool run_vector_matrix_multiplication = a->dimension(1) < 2; @@ -254,7 +288,8 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo( + m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); const ITensorInfo *matrix_a_info = a; const ITensorInfo *matrix_b_info = b; @@ -263,39 +298,44 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens TensorInfo tmp_b_info{}; TensorInfo tmp_output_info = *d->clone(); - if(run_interleave_transpose) + if (run_interleave_transpose) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape( + *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape( + *b, mult_transpose1xW_width))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); } // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); + auto_init_if_empty(tmp_output_info, + matrix_a_info->clone()->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - if(is_c_bias) + if (is_c_bias) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE)); } } // Validate matrix addition kernel - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta)); } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation)); } @@ -312,15 +352,15 @@ void CpuGemm::run(ITensorPack &tensors) auto c = tensors.get_const_tensor(ACL_SRC_2); auto d = tensors.get_tensor(ACL_DST); - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { // Pass c to asm dispatch only if it's the bias tensor ITensorPack asm_pack = tensors; asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? 
c : nullptr); _asm_glue->run(asm_pack); - if(_run_alpha_scale) + if (_run_alpha_scale) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _alpha_scale_func->run(pack); } } @@ -330,18 +370,20 @@ void CpuGemm::run(ITensorPack &tensors) CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true); CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); - ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } }; - if(!_run_vector_matrix_multiplication) + ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; + if (!_run_vector_matrix_multiplication) { // Run interleave kernel - ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } }; - NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack); + ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; + NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), + interleave_pack); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), + transpose_pack); } // Use reshaped matrices @@ -349,48 +391,52 @@ void CpuGemm::run(ITensorPack &tensors) mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get()); } - NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack); + NEScheduler::get().schedule_op(_mm_kernel.get(), + _run_vector_matrix_multiplication ? 
Window::DimX : Window::DimY, + _mm_kernel->window(), mm_pack); // Run bias addition kernel - if(_run_bias_addition) + if (_run_bias_addition) { - ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}}; _add_bias->run(pack); } } // Run matrix addition kernel - if(_run_addition) + if (_run_addition) { - ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } }; + ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}}; NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack); } // Run activation function - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _activation_func->run(pack); } } void CpuGemm::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { _asm_glue->prepare(tensors); } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) { - const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); - ITensor *b_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransposedRHS))); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + ITensor *b_aux = + utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransposedRHS))); ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux); CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux); - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), + transpose_pack); } _is_prepared = true; } @@ -401,8 +447,12 @@ experimental::MemoryRequirements CpuGemm::workspace() const return _aux_mem; } -Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info) +Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info) { const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h index 9b08e5d0f6..6b30d134fa 100644 --- a/src/cpu/operators/CpuGemm.h +++ b/src/cpu/operators/CpuGemm.h @@ -24,12 +24,12 @@ #ifndef ARM_COMPUTE_CPU_GEMM_H #define ARM_COMPUTE_CPU_GEMM_H -#include "src/cpu/ICpuOperator.h" - #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/GEMMInfo.h" + +#include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" #include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" @@ -93,16 +93,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should happen only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, 
ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm. * * Similar to @ref CpuGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -111,12 +121,16 @@ public: * the value of arm_compute::WeightFormat need to be passed via the * parameter gemm_info. */ - static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info = GEMMInfo()); + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; /** Indicates if the convolution executes in variable weights mode. 
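For reference, a minimal end-to-end sketch of the interface declared above (hypothetical F32 shapes, D = A * B with alpha = 1, beta = 0 and no C operand; auxiliary workspace tensors reported by workspace() would normally be added to the pack as well but are omitted here):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuGemm.h"

    using namespace arm_compute;

    // In ACL's (x, y) ordering: A is K x M, B is N x K, D is N x M. Here M = 8, N = 16, K = 4.
    TensorInfo a_info(TensorShape(4U, 8U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo d_info(TensorShape(16U, 8U), 1, DataType::F32);

    cpu::CpuGemm gemm;
    if (bool(cpu::CpuGemm::validate(&a_info, &b_info, nullptr, &d_info, 1.f, 0.f)))
    {
        gemm.configure(&a_info, &b_info, nullptr, &d_info, 1.f, 0.f);

        Tensor a, b, d; // backing tensors for the run
        a.allocator()->init(a_info);
        b.allocator()->init(b_info);
        d.allocator()->init(d_info);
        a.allocator()->allocate();
        b.allocator()->allocate();
        d.allocator()->allocate();

        ITensorPack pack{{ACL_SRC_0, &a}, {ACL_SRC_1, &b}, {ACL_DST, &d}};
        gemm.prepare(pack); // reshapes B once when only the first run needs it
        gemm.run(pack);
    }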
@@ -138,28 +152,28 @@ private: Count }; - std::unique_ptr _interleave_kernel{ nullptr }; - std::unique_ptr _transpose_kernel{ nullptr }; - std::unique_ptr _mm_kernel{ nullptr }; - std::unique_ptr _asm_glue{ nullptr }; - std::unique_ptr _ma_kernel{ nullptr }; - std::unique_ptr _alpha_scale_func{ nullptr }; - std::unique_ptr _add_bias{ nullptr }; - std::unique_ptr _activation_func{ nullptr }; + std::unique_ptr _interleave_kernel{nullptr}; + std::unique_ptr _transpose_kernel{nullptr}; + std::unique_ptr _mm_kernel{nullptr}; + std::unique_ptr _asm_glue{nullptr}; + std::unique_ptr _ma_kernel{nullptr}; + std::unique_ptr _alpha_scale_func{nullptr}; + std::unique_ptr _add_bias{nullptr}; + std::unique_ptr _activation_func{nullptr}; TensorInfo _tmp_a{}; TensorInfo _tmp_b{}; TensorInfo _tmp_d{}; - bool _run_vector_matrix_multiplication{ false }; - bool _run_alpha_scale{ false }; - bool _run_addition{ false }; - bool _run_bias_addition{ false }; - bool _run_activation{ false }; - bool _reshape_b_only_on_first_run{ false }; - bool _is_prepared{ false }; + bool _run_vector_matrix_multiplication{false}; + bool _run_alpha_scale{false}; + bool _run_addition{false}; + bool _run_bias_addition{false}; + bool _run_activation{false}; + bool _reshape_b_only_on_first_run{false}; + bool _is_prepared{false}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp index 39b410d609..7c59d88c61 100644 --- a/src/cpu/operators/CpuGemmConv2d.cpp +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" @@ -52,8 +52,11 @@ namespace arm_compute { namespace cpu { -CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) +CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -62,63 +65,86 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - if(skip_im2col) + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + 
conv_info.stride().first == 1 && conv_info.stride().second == 1); + + if (skip_im2col) { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); + if (skip_col2im) { - return { true, true }; + return {true, true}; } } else { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); + if (skip_col2im) { - return { false, true }; + return {false, true}; } } // Default case when we cannot reinterpret the input and output as 3D. - return { false, false }; + return {false, false}; } CpuGemmConv2d::CpuGemmConv2d() - : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), - _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(), + _mm_gemm(), + _mm_gemmlowp(), + _col2im_kernel(), + _reshape(), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _gemm_output_3d(), + _data_layout(DataLayout::NCHW), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } CpuGemmConv2d::~CpuGemmConv2d() = default; -void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info, - bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format) +void CpuGemmConv2d::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool fixed_format, + arm_compute::WeightFormat weight_format) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format)); + ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, + _skip_im2col, fixed_format, weight_format)); // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); + const GEMMInfo &gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weight_format); // Supported activations in GEMM - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(_is_quantized) + if (_is_quantized) { - TensorInfo tmp_src{ *src }; - TensorInfo tmp_weights{ *weights }; + TensorInfo tmp_src{*src}; + TensorInfo tmp_weights{*weights}; // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo iqinfo = src->quantization_info(); @@ -129,7 +155,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig const DataType data_type = src->data_type(); tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); - if(!is_data_type_quantized_per_channel(tmp_weights.data_type())) + if (!is_data_type_quantized_per_channel(tmp_weights.data_type())) { const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); @@ -142,7 +168,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig int32_t min_activation = type_min.get(); int32_t max_activation = type_max.get(); - if(supported_acts.count(act_info.activation()) != 0) + if (supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -156,11 +182,12 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp = std::make_unique(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format, - weight_format)); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, + enable_fast_math, false, act_info, fixed_format, weight_format)); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -171,26 +198,35 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig _mm_gemm = std::make_unique(); _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format) +Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool skip_im2col, + bool fixed_format, + arm_compute::WeightFormat weight_format) { const DataType data_type = src->data_type(); const bool is_quantized = 
is_data_type_quantized_asymmetric(data_type); const bool is_activation_enabled = act_info.enabled(); // Create GEMMInfo structure - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weight_format); - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -206,11 +242,10 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei int32_t min_activation = type_min.get(); int32_t max_activation = type_max.get(); - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -229,8 +264,9 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math, - false, act_info)); + return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, + output_info, false, enable_fast_math, false, act_info)); } else { @@ -239,36 +275,44 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei } } -Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) +Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col) { const DataType data_type = input_info->data_type(); const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; const unsigned int mult_z = skip_im2col ? 
gemm_3d_depth : 1U; // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, + input_info->quantization_info()); const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, + input_info->quantization_info()); - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col); + return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, + gemm_3d_depth, skip_im2col); } -void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuGemmConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, - weights, - biases, - dst, - conv_info, - weights_info, - dilation, - act_info, - enable_fast_math, - num_groups)); - ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, + num_groups); const DataType data_type = src->data_type(); const DataLayout data_layout = src->data_layout(); @@ -283,7 +327,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); const ITensorInfo *gemm_input_to_use = src; ITensorInfo *gemm_output_to_use = dst; @@ -291,20 +336,17 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); 
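The conv_w/conv_h values computed above follow the standard convolution output-size formula; assuming FLOOR rounding in PadStrideInfo:

    // out = (in + pad_left + pad_right - dilation * (kernel - 1) - 1) / stride + 1
    // e.g. in = 224, kernel = 3, stride = 2, pad = 1, dilation = 1
    //      -> out = (224 + 2 - 2 - 1) / 2 + 1 = 112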
ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); - _skip_im2col = skip_info.skip_im2col; - _skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + _skip_im2col = skip_info.skip_im2col; + _skip_col2im = skip_info.skip_col2im; // Get parameters from conv_info unsigned int stride_x = 0; @@ -320,17 +362,19 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights _weights_reshaped.set_quantization_info(weights->quantization_info()); // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); unsigned int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); } // Configure _im2col_kernel = std::make_unique(); - _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right); + _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, + num_groups, input_pad_right); // Update GEMM input gemm_input_to_use = &_im2col_output; @@ -338,7 +382,7 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -368,9 +412,10 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format()); + configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, + gemm_3d_depth, fixed_format, weights_info.weight_format()); - if(!_skip_col2im && _data_layout == DataLayout::NCHW) + if (!_skip_col2im && _data_layout == DataLayout::NCHW) { // Configure col2im _col2im_kernel = std::make_unique(); @@ -390,14 +435,24 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS // Check lifetime - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? 
MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), + gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, + _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); } -Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -406,36 +461,44 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - dilation, act_info); + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); const bool skip_im2col = skip_info.skip_im2col; const bool skip_col2im = skip_info.skip_col2im; const unsigned int gemm_3d_depth = skip_col2im ? 
conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format()); + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format()); return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); - if(!is_fixed_format(weights_info.weight_format())) + if (!is_fixed_format(weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -468,29 +531,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - dilation, act_info); - const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel)); 
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(is_bf16) + else if (is_bf16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -503,20 +562,23 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight } unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); + unsigned int mat_weights_rows = + weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type()); weights_reshaped_info.set_quantization_info(weights->quantization_info()); weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); - mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * + (weights->dimension(idx_channel) + input_pad_right); } // Create tensor info for im2col reshaped inputs @@ -528,13 +590,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); im2col_reshaped_info.set_quantization_info(src->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), + conv_info, append_bias, dilation, num_groups, input_pad_right)); gemm_input_to_use = &im2col_reshaped_info; } // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); shape_gemm.set(0, mat_weights_cols); @@ -549,13 +613,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight gemm_output_to_use = &info_gemm; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, + enable_fast_math, skip_col2im ? 
conv_h : 0, skip_im2col, fixed_format, weights_info.weight_format())); // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) + if (!skip_col2im && (data_layout == DataLayout::NCHW)) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); } return Status{}; @@ -574,15 +640,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors) CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); - if(!_skip_im2col) + if (!_skip_im2col) { // Run input reshaping unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack); gemm_input_to_use = im2col_output.get(); } @@ -595,11 +657,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors) gemm3d.allocator()->import_memory(out_to_use->buffer()); auto gemm_output_to_use = gemm_output.get(); - if(_skip_im2col) + if (_skip_im2col) { gemm_output_to_use = &gemm3d; } - if(_skip_col2im && !out_has_padding) + if (_skip_col2im && !out_has_padding) { gemm_output_to_use = dst; } @@ -607,12 +669,12 @@ void CpuGemmConv2d::run(ITensorPack &tensors) // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); - if(!this->isVarWeightsKernel()) + if (!this->isVarWeightsKernel()) { pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -624,45 +686,33 @@ void CpuGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix - if(!_skip_col2im) + if (!_skip_col2im) { - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output.get() }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}}; NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack); } else { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; _reshape->run(pack); } } - else if(out_has_padding) + else if (out_has_padding) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; _reshape->run(pack); } } void CpuGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Variable weights executions that use fixed-format kernels // need no reshaping of the weights. - if(this->isVarWeightsKernel()) + if (this->isVarWeightsKernel()) { _is_quantized ? 
_mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors); _is_prepared = true; @@ -672,11 +722,7 @@ void CpuGemmConv2d::prepare(ITensorPack &tensors) // Run weights reshaping and mark original weights tensor as unused CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack); weights->mark_as_unused(); ITensorPack gemm_pack = tensors; diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h index 61fe63a79f..118d366517 100644 --- a/src/cpu/operators/CpuGemmConv2d.h +++ b/src/cpu/operators/CpuGemmConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" #include @@ -106,17 +107,32 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -124,10 +140,16 @@ public: * * @return a status. 
*/ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - const bool enable_fast_math = false); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const bool enable_fast_math = false); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -150,8 +172,15 @@ private: * @param[in] fixed_format (Optional) Select GEMM execution with variable weights. * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights. */ - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. @@ -170,8 +199,16 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool skip_im2col = false, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmMLowpMatrixMultiplyCore * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. 
@@ -182,7 +219,11 @@ private: * * @return a status */ - static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col); + static Status validate_gemm3d(const ITensorInfo *src, + const ITensorInfo *weights, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col); struct SkipInfo { @@ -200,8 +241,11 @@ private: * * @return a SkipInfo instance. */ - static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info); + static SkipInfo skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info); /** Indicates if the convolution executes in variable weights mode. * @@ -236,7 +280,7 @@ private: bool _is_quantized; bool _is_prepared; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp index 5ce285cb6f..8fa81b1907 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.cpp +++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" - #include "support/Cast.h" #include @@ -43,7 +43,10 @@ using namespace arm_compute::utils::cast; namespace { -GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -53,16 +56,15 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); const DataType data_type = src->data_type(); // Merge activation with output stage - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - PixelValue type_min{}; - PixelValue type_max{}; + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + PixelValue type_min{}; + PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); int32_t min_activation = type_min.get(); int32_t max_activation = type_max.get(); - if(supported_acts.count(act.activation()) != 0) + if (supported_acts.count(act.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); } @@ -107,31 +109,32 @@ CpuGemmDirectConv2d::CpuGemmDirectConv2d() CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default; -void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const 
ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info) +void CpuGemmDirectConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info); _is_prepared = false; - _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 }); + _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2}); // Configure assembly dispatch cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); } _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info); // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, info.act_info); } @@ -141,24 +144,33 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); } else { // We must permute weights if they are WeightFormat::UNSPECIFIED - if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); + if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); } } -Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info) +Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - if(!is_fixed_format(info.weights_info.weight_format())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); + if (!is_fixed_format(info.weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -171,13 +183,13 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo * ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(data_type == DataType::BFLOAT16) + else if (data_type == DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -198,31 +210,32 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors) prepare(tensors); _gemm_asm_func->run(tensors); - if(_run_activation) + if (_run_activation) { ITensor *io = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } }; + ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}}; _activation_func->run(pack); } } void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // If we are using fixed-format kernel the weights are already reshaped - if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) + if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) { _gemm_asm_func->prepare(tensors); _is_prepared = true; return; } - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _weights_permute_func->run(permute_tensors); tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h index e55a461f36..1cc3caadae 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.h +++ b/src/cpu/operators/CpuGemmDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuActivation.h" @@ -69,18 +70,26 @@ public: * Data types supported: Same as @p input. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d * * Similar to CpuGemmDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index 8ca128fb07..2ee879b67b 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -28,14 +28,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/MemoryHelpers.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" @@ -59,12 +59,12 @@ namespace cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) { cpu::AsmGemmInfo asm_info; - asm_info.method = cpu::AsmConvMethod::Im2Col; - asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); - asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); - asm_info.activation_info = info.activation_info(); - asm_info.output_stage = info.gemmlowp_output_stage(); - asm_info.fast_mode = info.fast_math(); + asm_info.method = cpu::AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + asm_info.fast_mode = info.fast_math(); return asm_info; } @@ -105,7 +105,8 @@ CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore() } CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default; -void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) +void CpuGemmLowpMatrixMultiplyCore::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst); ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info)); @@ -122,28 +123,31 @@ 
void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _reshape_b_only_on_first_run = b->are_values_constant(); _is_prepared = false; _fused_assembly_path = false; - _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _gemm_info = gemm_info; + _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && + _reshape_b_only_on_first_run; + _gemm_info = gemm_info; _asm_glue = std::make_unique(); const ITensorInfo *a_to_use = a; // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) + if (_flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); _convert_to_signed_asymm = std::make_unique(); _convert_to_signed_asymm->configure(a_to_use, &_signed_a); a_to_use = &_signed_a; _a_offset = _signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + _signed_output = dst->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -157,7 +161,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { _fuse_output_stage = true; _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32); @@ -166,16 +170,18 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso // Initialize assembly kernel meta-data const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); #ifdef __aarch64__ - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - switch(a->data_type()) + switch (a->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: case DataType::U8: case DataType::S8: { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { auto c_info_to_use = c == nullptr ? 
nullptr : c; _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info); @@ -197,13 +203,14 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } } #endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) + if (!(_assembly_path || _run_vector_matrix_multiplication)) { matrix_a = &_tmp_a; matrix_b = &_tmp_b; // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); + _tmp_a = + TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info()); @@ -216,13 +223,13 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_b_reshape_kernel->configure(b, &_tmp_b); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Build reduction info const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); @@ -232,7 +239,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32); @@ -241,24 +248,23 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); } - if(_fuse_output_stage) + if (_fuse_output_stage) { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique(); _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); } - _offset_contribution_output_stage_kernel = std::make_unique(); - _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : dst, - a->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); + _offset_contribution_output_stage_kernel = + std::make_unique(); + _offset_contribution_output_stage_kernel->configure( + &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output : dst, + a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage()); - if(_flip_signedness) + if (_flip_signedness) { _convert_from_signed_asymm = std::make_unique(); _convert_from_signed_asymm->configure(&_signed_output, dst); @@ -267,27 +273,29 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso else { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique(); _mm_kernel->configure(matrix_a, matrix_b, dst); } // Configure offset contribution kernel _offset_contribution_kernel = std::make_unique(); - _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, a_to_use->dimension(0), + _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0), _a_offset, _b_offset); } } // Configure activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) + _run_activation = + activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); + if (_run_activation) { _activation_func = std::make_unique(); _activation_func->configure(dst, nullptr, activation); } - if(_assembly_path) + if (_assembly_path) { auto asm_mem_req = _asm_glue->workspace(); _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; @@ -295,27 +303,41 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // Request memory for LHS and RHS reshape matrix - _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0 - && _reshape_b_only_on_first_run ? - MemoryLifetime::Persistent : - MemoryLifetime::Temporary, - _vector_sum_col.total_size()); - _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); - _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); + _aux_mem[VectorSumCol] = + MemoryInfo(offset_int_vec(VectorSumCol), + !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent + : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + _aux_mem[VectorSumRow] = + MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _tmp_b.total_size()); + _aux_mem[MMResultS32] = + MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); + _aux_mem[SignedOutput] = + MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); } -Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, + "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (a)->dimension(0) != (b)->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); @@ -333,28 +355,32 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens int32_t b_offset = b->quantization_info().uniform().offset; bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) + if (fuse_output_stage) { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, + a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); } // Convert QASYMM8->QASYMM8_SIGNED TensorInfo signed_a{}; TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) + bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && + (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); + if (flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const 
UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); a_to_use = &signed_a; a_offset = signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + signed_output = output->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -374,25 +400,28 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens bool run_optimised = false; bool run_optimised_requantized = false; - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); run_optimised_requantized = run_optimised; } else { - run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info)); + run_optimised = bool(CpuGemmAssemblyDispatch::validate( + a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, asm_info)); } } - if(run_optimised) + if (run_optimised) { ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) + if (info.depth_output_gemm3d() != 0) { - if(info.reinterpret_input_as_3d()) + if (info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); @@ -409,11 +438,13 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), + "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, + "NEGEMM cannot reinterpret the output tensor as 3D"); const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) + if (!run_vector_matrix_multiplication) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; @@ -437,7 +468,7 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } } - if(!run_optimised_requantized) + if (!run_optimised_requantized) { TensorInfo info_vector_sum_col{}; TensorInfo info_vector_sum_row{}; @@ -445,62 +476,70 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); } - if(fuse_output_stage) + if (fuse_output_stage) { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - flip_signedness ? &signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset, + b_offset, info.gemmlowp_output_stage())); } else { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + a_offset, b_offset)); } } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation)); } @@ -529,24 +568,22 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false); // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) + if (_flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a }, - { TensorType::ACL_DST, signed_a.get() } - }; - NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}}; + NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), + pack); a_to_use = signed_a.get(); matrix_a = signed_a.get(); } // Run GEMM - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { ITensorPack asm_glue_tensors = tensors; auto output_to_use = (_fuse_output_stage ? 
mm_result_s32.get() : dst); - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && + _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); @@ -563,35 +600,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - if(!_run_vector_matrix_multiplication) + if (!_run_vector_matrix_multiplication) { matrix_a = tmp_a.get(); matrix_b = tmp_b.get(); // Run interleave kernel - ITensorPack pack_a = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, tmp_a.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a); + ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), + pack_a); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { - ITensorPack pack_b = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}}; // Run transpose kernel - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b); + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, + _mtx_b_reshape_kernel->window(), pack_b); } } - ITensorPack pack_mm = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b } - }; - if(_fuse_output_stage) + ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}}; + if (_fuse_output_stage) { pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get()); } @@ -602,31 +629,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, vector_sum_row.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, + _mtx_a_reduction_kernel->window(), pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack pack = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } - if(_fuse_output_stage) + if (_fuse_output_stage) { ITensorPack pack; 
pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get()); @@ -636,7 +657,8 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, + _offset_contribution_output_stage_kernel->window(), pack); } else { @@ -646,68 +668,57 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, + _offset_contribution_kernel->window(), pack); } } // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, signed_output.get() }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, + _convert_from_signed_asymm->window(), pack); } // Run fused activation unless already run in the fused assembly - if(_run_activation) + if (_run_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; _activation_func->run(pack); } } void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1); // Run assembly reshape - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { _asm_glue->prepare(tensors); } // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) { // Run reshape kernel and mark original weights tensor as unused - ITensor *tmp_b_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(TmpB))); + ITensor *tmp_b_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(TmpB))); CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, tmp_b.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), + pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) { - ITensor *vector_sum_col_p = 
utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(VectorSumCol))); + ITensor *vector_sum_col_p = + utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(VectorSumCol))); CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } _is_prepared = true; } diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h index a1b34291d0..a7798938e7 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/GEMMInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -108,18 +109,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp index 58f98acff0..4215eed199 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp +++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" @@ -36,36 +37,42 @@ namespace arm_compute { namespace cpu { -void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void CpuGemmLowpOutputStage::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - switch(info.type) + 
switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: { auto k = std::make_unique(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QASYMM8_SIGNED: { auto k = std::make_unique(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QSYMM16: { auto k = std::make_unique(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, + info.gemmlowp_max_bound); _kernel = std::move(k); break; } @@ -79,7 +86,7 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -102,32 +109,41 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } } -Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, + "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && + (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: - return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QASYMM8_SIGNED: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, 
info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QSYMM16: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); default: return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); } } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -146,4 +162,4 @@ void CpuGemmLowpOutputStage::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h index 39394f6b5f..e5e2f41fa9 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.h +++ b/src/cpu/operators/CpuGemmLowpOutputStage.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" /** This file contains all available output stages for GEMMLowp. @@ -76,7 +77,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index 8811a7ea6b..89087129c3 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -23,14 +23,16 @@ */ #include "src/cpu/operators/CpuMatMul.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" + #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/MatMulInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +48,11 @@ namespace cpu { namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -59,10 +64,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - 
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -77,14 +83,27 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo } // namespace CpuMatMul::CpuMatMul() - : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape() + : _transpose_kernel_lhs(), + _transpose_kernel_rhs(), + _asm_glue(), + _lhs_transposed(), + _rhs_transposed(), + _original_lhs_shape(), + _original_rhs_shape(), + _original_dst_shape() { } -Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status CpuMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); @@ -103,34 +122,39 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const gemm_info.fast_mode = settings.fast_math(); // Validate and then permute a/b - if(adj_lhs) + if (adj_lhs) { - auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); + auto_init_if_empty(lhs_transposed, + lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed)); // Assign lhs_to_use pointer to use transposed TensorInfo lhs_to_use = &lhs_transposed; } - if(adj_rhs) + if (adj_rhs) { - auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); + auto_init_if_empty(rhs_transposed, + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed)); // Assign rhs_to_use pointer to use transposed TensorInfo rhs_to_use = &rhs_transposed; } ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)"); + "The product AB is defined only if the number of columns in A is equal to the " + "number of rows in B (after transpose)"); // Iterate over dimensions to be collapsed in operator - check 
dimensions are equivalent between tensors - for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) + for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), + "Broadcasting in Batch dimension is unsupported by this operator."); } // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, + gemm_info.activation_info, gemm_info.output_stage)); } cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info); @@ -138,7 +162,12 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const return Status{}; } -void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CpuMatMul::configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings); @@ -163,21 +192,23 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, _original_rhs_shape = rhs_to_use.tensor_shape(); // Reshape lhs for use with assembly kernels. - lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); - dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); + lhs_to_use.set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); + dst_to_use.set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // 2. Configuration for transpose of lhs/rhs // ------------------------------------------------------ // Initialise transposed TensorInfo class for aux tensors (intermediary tensors) - if(_adj_lhs) + if (_adj_lhs) { // Setup transpose LHS _transpose_kernel_lhs = std::make_unique(); _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed); } - if(_adj_rhs) + if (_adj_rhs) { // Setup transpose RHS _transpose_kernel_rhs = std::make_unique(); @@ -196,20 +227,22 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, rhs_to_use = (_adj_rhs) ? 
_rhs_transposed : rhs_to_use; // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage); + get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, + _gemm_info.output_stage); } // Configure Asm Kernel _asm_glue = std::make_unique(); - _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul + _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, + _gemm_info); // c is nullptr as bias not supported in MatMul // Specify memory requirements for intermediate tensors auto asm_mem_req = _asm_glue->workspace(); // Specify memory required by gemm kernel int idx = 0; - for(const auto &aux : asm_mem_req) + for (const auto &aux : asm_mem_req) { _aux_mem[idx] = aux; idx++; @@ -228,8 +261,12 @@ void CpuMatMul::run(ITensorPack &tensors) // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm) // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly) - lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z - dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + lhs->info()->set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, + _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + dst->info()->set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, + _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // Initialise object to handle stored transposed tensors in auxillary memory @@ -240,17 +277,19 @@ void CpuMatMul::run(ITensorPack &tensors) ITensorPack asm_tensors(tensors); // Run transpose lhs if necessary - if(_adj_lhs) + if (_adj_lhs) { - ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack); + ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), + lhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get()); } // Run transpose rhs if necessary - if(_adj_rhs) + if (_adj_rhs) { - ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack); + ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), + rhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get()); } // Run asm kernel diff --git a/src/cpu/operators/CpuMatMul.h 
b/src/cpu/operators/CpuMatMul.h index 475c019fd0..24db3da346 100644 --- a/src/cpu/operators/CpuMatMul.h +++ b/src/cpu/operators/CpuMatMul.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_OPERATORS_CPUMATMUL #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuTransposeKernel.h" @@ -66,18 +67,27 @@ public: * @param[in] settings The settings for matmul operation (i.e fast math) * @param[in] act_info Class containing information about fused activation function. */ - void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -91,9 +101,9 @@ private: }; // Define unique pointers to kernels/operators used by matmul - std::unique_ptr _transpose_kernel_lhs{ nullptr }; - std::unique_ptr _transpose_kernel_rhs{ nullptr }; - std::unique_ptr _asm_glue{ nullptr }; + std::unique_ptr _transpose_kernel_lhs{nullptr}; + std::unique_ptr _transpose_kernel_rhs{nullptr}; + std::unique_ptr _asm_glue{nullptr}; // TensorInfo for tensors stored in auxillary memory TensorInfo _lhs_transposed{}; @@ -105,13 +115,13 @@ private: TensorShape _original_dst_shape{}; // Note : adj_lhs means the same as transposing lhs - bool _adj_lhs{ false }; - bool _adj_rhs{ false }; - bool _fast_math{ false }; + bool _adj_lhs{false}; + bool _adj_rhs{false}; + bool _fast_math{false}; AsmGemmInfo _gemm_info{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; -} -} +} // namespace cpu +} // namespace arm_compute #endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */ diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp index 24e9fd6d46..697fc40ab3 100644 --- a/src/cpu/operators/CpuMaxUnpooling.cpp +++ b/src/cpu/operators/CpuMaxUnpooling.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuMaxUnpooling.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" @@ -29,7 +30,10 @@ namespace arm_compute { namespace cpu { -void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info) +void CpuMaxUnpooling::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info); auto k = std::make_unique(); @@ -37,9 +41,12 @@ void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indic _kernel = std::move(k); } -Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status CpuMaxUnpooling::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h index aa1f1072a5..5dc00bce9e 100644 --- a/src/cpu/operators/CpuMaxUnpooling.h +++ b/src/cpu/operators/CpuMaxUnpooling.h @@ -44,14 +44,18 @@ public: * @param[out] dst Destination tensor. Data types supported: Same as @p src * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuMaxUnpooling::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp index 4c15015206..ac9847111d 100644 --- a/src/cpu/operators/CpuMul.cpp +++ b/src/cpu/operators/CpuMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMulKernel.h" @@ -33,14 +34,24 @@ namespace arm_compute { namespace cpu { -Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status CpuMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); } -void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void CpuMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + 
ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -58,13 +69,19 @@ void CpuMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status CpuComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuComplexMulKernel::validate(src1, src2, dst); } -void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void CpuComplexMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); @@ -80,4 +97,4 @@ void CpuComplexMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h index 3e0edbf050..82b309830b 100644 --- a/src/cpu/operators/CpuMul.h +++ b/src/cpu/operators/CpuMul.h @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -61,7 +62,12 @@ public: * @param[in] rounding_policy Rounding policy. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +75,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -89,14 +100,20 @@ public: * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp index babaf21b6f..25acc92d00 100644 --- a/src/cpu/operators/CpuPermute.cpp +++ b/src/cpu/operators/CpuPermute.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuPermute.h" -#include "src/cpu/kernels/CpuPermuteKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPermuteKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, cons { return kernels::CpuPermuteKernel::validate(src, dst, perm); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp index 722cd36ee5..b72bde6978 100644 --- a/src/cpu/operators/CpuPool2d.cpp +++ b/src/cpu/operators/CpuPool2d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool2dKernel.h" #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" @@ -53,7 +54,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices); // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); // Get data layout _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; @@ -61,10 +63,11 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Check if we have Global Pooling Layer const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - _use_kernel_indices = pool_info.use_kernel_indices; + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && + (src->dimension(idx_height) == pool_info.pool_size.height); + _use_kernel_indices = pool_info.use_kernel_indices; - if(run_optimised) + if (run_optimised) { const CPUInfo &ci = NEScheduler::get().cpu_info(); const unsigned int num_threads = NEScheduler::get().num_threads(); @@ -76,7 +79,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Get kernel's memory requirements constexpr size_t alignment = 4096; const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); _asm_glue = std::move(pooling_wrapper); } @@ -89,11 +92,15 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer } } -Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CpuPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - if(run_optimised) + if (run_optimised) { return Status{}; } @@ -105,20 +112,24 @@ void CpuPool2d::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - if(_asm_glue) + if (_asm_glue) { const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); } else { - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + _is_global_pooling_layer ? Window::DimZ : Window::DimY, + _pooling_layer_kernel->window(), tensors); break; case DataLayout::NHWC: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + (_use_kernel_indices ? 
Window::DimY : Window::DimX), + _pooling_layer_kernel->window(), tensors); break; default: ARM_COMPUTE_ERROR("Data layout not supported"); diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h index 5c571db88a..ea73e3f335 100644 --- a/src/cpu/operators/CpuPool2d.h +++ b/src/cpu/operators/CpuPool2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL2D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -58,17 +59,21 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp index 14e4ac6c97..7fa78c1f80 100644 --- a/src/cpu/operators/CpuPool3d.cpp +++ b/src/cpu/operators/CpuPool3d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool3dKernel.h" @@ -35,8 +36,7 @@ namespace arm_compute { namespace cpu { -CpuPool3d::CpuPool3d() - : _aux_mem(1) +CpuPool3d::CpuPool3d() : _aux_mem(1) { } @@ -70,4 +70,4 @@ experimental::MemoryRequirements CpuPool3d::workspace() const } } // namespace cpu -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h index 8a73f8a0af..235d798095 100644 --- a/src/cpu/operators/CpuPool3d.h +++ b/src/cpu/operators/CpuPool3d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL3D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -61,7 +62,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp index f9e14d1f88..4315499c39 100644 --- a/src/cpu/operators/CpuQuantize.cpp +++ b/src/cpu/operators/CpuQuantize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuQuantizeKernel.h" diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp index e6892a2e7e..a423abb49a 100644 --- 
a/src/cpu/operators/CpuReshape.cpp +++ b/src/cpu/operators/CpuReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/cpu/operators/CpuReshape.h" -#include "src/cpu/kernels/CpuReshapeKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuReshapeKernel.h" namespace arm_compute { @@ -49,7 +48,7 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) void CpuReshape::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - if(!_is_prepared) + if (!_is_prepared) { static_cast(_kernel.get())->prepare(tensors); _is_prepared = true; diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h index 9bc43e7db4..33da792319 100644 --- a/src/cpu/operators/CpuReshape.h +++ b/src/cpu/operators/CpuReshape.h @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_CPU_RESHAPE_H #define ARM_COMPUTE_CPU_RESHAPE_H -#include "src/cpu/ICpuOperator.h" #include "arm_compute/core/Window.h" +#include "src/cpu/ICpuOperator.h" + namespace arm_compute { namespace cpu @@ -53,7 +54,7 @@ public: void run(ITensorPack &tensors) override; private: - bool _is_prepared{ false } ; + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp index 8a712bf088..7df9296931 100644 --- a/src/cpu/operators/CpuScale.cpp +++ b/src/cpu/operators/CpuScale.cpp @@ -24,8 +24,9 @@ #include "src/cpu/operators/CpuScale.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/kernels/CpuScaleKernel.h" @@ -37,11 +38,12 @@ namespace cpu { namespace { -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) +void precompute_dx_dy_offsets( + ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) { ARM_COMPUTE_ERROR_ON(offsets == nullptr); float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) + if (sampling_policy == SamplingPolicy::CENTER) { sampling_offset = 0.5f; } @@ -50,38 +52,44 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - if(dx != nullptr && dy != nullptr) + if (dx != nullptr && dy != nullptr) { // Pre-compute the offset and pixel's distance for BILINEAR interpolation Iterator offsets_it(offsets, win); Iterator dx_it(dx, win); Iterator dy_it(dy, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; + const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; + const int in_xi = std::floor(in_x); + const int in_yi = std::floor(in_y); - *reinterpret_cast(offsets_it.ptr()) = in_xi; - *reinterpret_cast(dx_it.ptr()) = in_x - in_xi; 
- *reinterpret_cast(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); + *reinterpret_cast(offsets_it.ptr()) = in_xi; + *reinterpret_cast(dx_it.ptr()) = in_x - in_xi; + *reinterpret_cast(dy_it.ptr()) = in_y - in_yi; + }, + offsets_it, dx_it, dy_it); } else { // Pre-compute the offset for NEAREST interpolation Iterator offsets_it(offsets, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float float_in_xi = (id.x() + sampling_offset) * wr; - const auto in_xi = static_cast(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); - *reinterpret_cast(offsets_it.ptr()) = in_xi; - }, - offsets_it); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float float_in_xi = (id.x() + sampling_offset) * wr; + const auto in_xi = static_cast( + align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) + : std::floor(float_in_xi)); + *reinterpret_cast(offsets_it.ptr()) = in_xi; + }, + offsets_it); } } } // namespace @@ -96,20 +104,24 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn _is_prepared = false; // Get data layout and width/height indices - _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; // Get the tensor shape TensorShape shape(dst->dimension(idx_width)); @@ -122,7 +134,7 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn auto dy = std::make_unique(tensor_info_dxdy); auto offsets = std::make_unique(tensor_info_offsets); auto scale_kernel = std::make_unique(); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -148,7 +160,8 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); ITensorInfo *offsets = nullptr; ITensorInfo *dx = nullptr; @@ -160,19 +173,25 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape of auxilary buffers const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height)); TensorInfo tensor_info_offsets(shape, Format::S32); TensorInfo tensor_info_dx(shape, Format::F32); TensorInfo tensor_info_dy(shape, Format::F32); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: offsets = &tensor_info_offsets; @@ -186,13 +205,14 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const break; } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); return Status{}; } void CpuScale::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { _is_prepared = true; const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); @@ -206,22 +226,27 @@ void CpuScale::prepare(ITensorPack &tensors) const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; const SamplingPolicy sampling_policy = _scale_info.sampling_policy; - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -245,7 +270,8 @@ void CpuScale::prepare(ITensorPack &tensors) } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && + policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h index ee7c523bad..c12a8e733a 100644 --- a/src/cpu/operators/CpuScale.h +++ b/src/cpu/operators/CpuScale.h @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_CPU_SCALE_H #define ARM_COMPUTE_CPU_SCALE_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/experimental/Types.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -62,9 +63,9 @@ public: void run(ITensorPack &tensors) override; private: - ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - bool _is_prepared{ false }; + ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index bf4c2fa3a2..e55d7f903e 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" @@ -63,13 +64,15 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis); - const unsigned int actual_axis = static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); _needs_permute = actual_axis > 0; - if(_needs_permute) + if (_needs_permute) { - _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_input.configure(src, &_input_permuted, + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } // We want to deal with a 2D input. 
Either it is the permuted version of the original input (4D case) @@ -79,10 +82,11 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d // Create intermediate tensors shapes TensorShape max_sum_shape = tmp_input->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); + TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); + TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); // Init intermediate tensors _max = TensorInfo(max_info); @@ -94,13 +98,14 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d _max_kernel = std::move(mk); auto sm = std::make_unique>(); - if(_needs_permute) + if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); // Re-permute the permuted output into the requested (4D) output - _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_output.configure(&_output_permuted, dst, + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } else { @@ -109,11 +114,15 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _input_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _output_permuted.total_size()); } template @@ -123,7 +132,8 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast(-src->num_dimensions()) || static_cast(src->num_dimensions()) <= axis); + ARM_COMPUTE_RETURN_ERROR_ON(axis < 
static_cast(-src->num_dimensions()) || + static_cast(src->num_dimensions()) <= axis); // Create intermediate tensor info DataType tmp_data_type = src->data_type(); @@ -131,25 +141,33 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor TensorShape max_sum_shape = src->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true)); + const TensorInfo tensor_info_max_sum(src->clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(src->quantization_info()) + .set_is_resizable(true)); const TensorInfo dont_care; - const unsigned int actual_axis = static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); const bool needs_permute = actual_axis > 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); - TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); + TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); } ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel::validate( + &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); return Status{}; } @@ -166,43 +184,38 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); - CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); + CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, + true); ITensorPack max_pack; ITensorPack softmax_pack; - if(_needs_permute) + if (_needs_permute) { - ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } }; + ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } }; + max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = - { - { TensorType::ACL_SRC_0, input_permuted.get() }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, 
output_permuted.get() }, - { TensorType::ACL_DST_1, tmp.get() } - }; + softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, output_permuted.get()}, + {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, src }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, dst }, - { TensorType::ACL_DST_1, tmp.get() } - }; + max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; + + softmax_pack = {{TensorType::ACL_SRC_0, src}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, dst}, + {TensorType::ACL_DST_1, tmp.get()}}; } NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); - if(_needs_permute) + if (_needs_permute) { ITensorPack permute_out_pack; permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get()); @@ -211,7 +224,7 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) } } -template +template experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const { return _aux_mem; diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 64df8704f9..8cab70e14f 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -24,11 +24,13 @@ #ifndef ARM_COMPUTE_CPU_SOFTMAX_H #define ARM_COMPUTE_CPU_SOFTMAX_H -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/TensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuPermute.h" + #include namespace arm_compute @@ -77,7 +79,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp index 91a5b6e63c..7d27efbc96 100644 --- a/src/cpu/operators/CpuSub.cpp +++ b/src/cpu/operators/CpuSub.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuSub.h" -#include "src/cpu/kernels/CpuSubKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuSubKernel.h" namespace arm_compute { namespace cpu { -void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuSub::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy); @@ -42,7 +45,11 @@ void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuSub::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { 
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuSubKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h index 88908637aa..d1782a1d3c 100644 --- a/src/cpu/operators/CpuSub.h +++ b/src/cpu/operators/CpuSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -53,14 +54,22 @@ public: * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSub::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp index 4e7854fd6e..ea548e0511 100644 --- a/src/cpu/operators/CpuTranspose.cpp +++ b/src/cpu/operators/CpuTranspose.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuTranspose.h" -#include "src/cpu/kernels/CpuTransposeKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) { return kernels::CpuTransposeKernel::validate(src, dst); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp index c4edd89964..9d07736c13 100644 --- a/src/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -22,23 +22,25 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuWinogradConv2d.h" + #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/kernels/assembly/winograd.hpp" #include "src/core/NEON/kernels/convolution/common/tensor.hpp" #include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuPermute.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" @@ -56,21 +58,26 @@ namespace inline Tensor4DShape internal_get_shape(const ITensorInfo *in) { const DataLayout data_layout = in->data_layout(); - const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); + const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); + const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); + const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; + return Tensor4DShape{in_batches, in_height, in_width, in_channels}; } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(dst, weights); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd layer only supports unit strides."); + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); @@ -80,43 +87,46 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo 
&conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math, - arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr &conv_args) +bool get_winograd_kernel_implementation(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + arm_conv::winograd::WinogradImpl *winograd_impl, + std::unique_ptr &conv_args) { arm_conv::winograd::WinogradConfig winograd_cfg; arm_gemm::GemmConfig cfg; const DataType data_type = src->data_type(); - Tensor4DShape in_shape{ internal_get_shape(src) }; - Tensor4DShape out_shape{ internal_get_shape(dst) }; - Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + Tensor4DShape in_shape{internal_get_shape(src)}; + Tensor4DShape out_shape{internal_get_shape(dst)}; + Tensor4DShape kernel_shape{internal_get_shape(weights)}; uint32_t nthreads = NEScheduler::get().num_threads(); // Get configuration arguments for Winograd winograd_cfg.output_rows = 0; winograd_cfg.output_cols = 0; conv_args = std::make_unique( - in_shape.n_batches, - arm_conv::Shape2D{ static_cast(in_shape.n_rows), static_cast(in_shape.n_cols) }, - in_shape.n_channels, - conv_info.pad_top(), - conv_info.pad_left(), - arm_conv::Shape2D{ static_cast(out_shape.n_rows), static_cast(out_shape.n_cols) }, - out_shape.n_channels, - arm_conv::Shape2D{ static_cast(kernel_shape.n_rows), static_cast(kernel_shape.n_cols) }, - assembly_utils::map_to_arm_gemm_activation(act_info)); + in_shape.n_batches, + arm_conv::Shape2D{static_cast(in_shape.n_rows), static_cast(in_shape.n_cols)}, + in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(), + arm_conv::Shape2D{static_cast(out_shape.n_rows), static_cast(out_shape.n_cols)}, + out_shape.n_channels, + arm_conv::Shape2D{static_cast(kernel_shape.n_rows), static_cast(kernel_shape.n_cols)}, + assembly_utils::map_to_arm_gemm_activation(act_info)); bool success = false; - if(data_type == DataType::F32) + if (data_type == DataType::F32) { - success = arm_conv::winograd::get_implementation( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - else if(data_type == DataType::F16) + else if (data_type == DataType::F16) { - success = arm_conv::winograd::get_implementation<__fp16>( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) else @@ -127,7 +137,8 @@ bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInf } inline bool fuse_function_supported(const ActivationLayerInfo &act_info) { - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; + return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || + act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; } } // namespace @@ -141,7 +152,7 @@ CpuWinogradConv2d::CpuWinogradConv2d() _permute_output(std::make_unique()), 
_permute_weights(std::make_unique()), _aux_mem(AuxTensorIdx::Count), - _conv_args{ nullptr }, + _conv_args{nullptr}, _winograd_impl{}, _data_layout(), _winograd_transformed_input{}, @@ -152,15 +163,20 @@ CpuWinogradConv2d::CpuWinogradConv2d() _weights_hwio(), _input_nhwc(), _output_nhwc(), - _is_prepared{ false }, - _run_activation{ false } + _is_prepared{false}, + _run_activation{false} { } CpuWinogradConv2d::~CpuWinogradConv2d() = default; -void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void CpuWinogradConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); @@ -169,21 +185,29 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei const DataType data_type = src->data_type(); uint32_t nthreads = NEScheduler::get().num_threads(); _data_layout = src->data_layout(); - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; - - bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args); - - ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - - const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); - if(has_impl) + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; + + bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &_winograd_impl, _conv_args); + + ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + + const bool has_impl = ((_winograd_impl.input_transform != nullptr) && + (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); + if (has_impl) { // Determine how much working space is required, allocate it. 
- const size_t input_workspace_size = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); - const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); + const size_t input_workspace_size = + _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); + const size_t output_workspace_size = + _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8); TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8); @@ -232,7 +256,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U)); weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); @@ -242,28 +266,30 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector); // Reorder the convoluted output to ACL's ordering NCHW - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), - dst->dimension(1), dst->dimension(3)), - 1, dst->data_type()); + TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1, + dst->data_type()); _output_nhwc = info; _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); } // Configure input transform kernel - _transform_input_kernel = std::make_unique(_winograd_impl, *_conv_args, nthreads); + _transform_input_kernel = + std::make_unique(_winograd_impl, *_conv_args, nthreads); // Configure GEMM function - _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f); + _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, + &_winograd_transformed_output, 1.0f, 0.f); // Configure output transform kernel - _transform_output_kernel = std::make_unique(_winograd_impl, *_conv_args, nthreads); + _transform_output_kernel = + std::make_unique(_winograd_impl, *_conv_args, nthreads); //Configure Activation Layer _run_activation = act_info.enabled() && !fuse_function_supported(act_info); - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, act_info); } @@ -276,40 +302,55 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _aux_mem[TempResult] = asm_mem_req[TempResult]; // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. 
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment); - _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment); - _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size)); - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); - _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment); - if(_data_layout == DataLayout::NCHW) + _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, + wds.input_matrix_size_bytes, storage_alignment); + _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, + wds.output_matrix_size_bytes, storage_alignment); + _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, + std::max(input_workspace_size, output_workspace_size)); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); + _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, + wds.weight_matrix_size_bytes, storage_alignment); + if (_data_layout == DataLayout::NCHW) { _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); } } } -Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CpuWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); // Disable winograd for fp16 if fast math is false. 
- if(!enable_fast_math) + if (!enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); } - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; arm_conv::winograd::WinogradImpl winograd_impl{}; std::unique_ptr conv_args; - const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str()); + const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &winograd_impl, conv_args); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); return Status{}; } @@ -328,24 +369,29 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory. CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true); - CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true); + CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, + tensors, true); CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true); const bool is_nchw = _data_layout == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}}; _permute_input->run(pack); } - CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true); + CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, + tensors, true); CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true); CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); - ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } }; + ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? 
input_nhwc.get() : src}, + {ACL_DST, winograd_input_transformed.get()}, + {ACL_INT, input_workspace.get()}}; NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack); - CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true); + CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, + tensors, true); // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs ITensorPack gemm_pack = tensors; @@ -356,30 +402,34 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) _gemm_function->run(gemm_pack); // Output transform - ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } }; + ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()}, + {ACL_DST, is_nchw ? output_nhwc.get() : output}, + {ACL_SRC_1, biases}, + {ACL_INT, output_workspace.get()}}; NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack); - if(is_nchw) + if (is_nchw) { // Reorder the convoluted output to ACL's ordering NCHW - ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}}; _permute_output->run(pack); } - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}}; _activation_func->run(pack); } } void CpuWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _permute_weights->run(permute_tensors); const int element_size_in_bytes = permuted_weights.get()->info()->element_size(); // Weights were in OHWI format, before being permuted "permuted_weights" to be in HWIO format. 
@@ -387,31 +437,32 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors) const unsigned int width_idx = 2; // W in HWIO const unsigned int channel_idx = 1; // I in HWIO - const int permuted_weight_row_stride = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; - const int permuted_weight_col_stride = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; - const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; + const int permuted_weight_row_stride = + permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; + const int permuted_weight_col_stride = + permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; + const int permuted_weight_channel_stride = + permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; // Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory. - ITensor *weights_transf = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransformedWeights))); + ITensor *weights_transf = + utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransformedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf); CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf); const void *permuted_weights_ptr; void *win_wght_transf_ptr; - permuted_weights_ptr = reinterpret_cast(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); - win_wght_transf_ptr = reinterpret_cast(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); + permuted_weights_ptr = reinterpret_cast( + permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); + win_wght_transf_ptr = + reinterpret_cast(winograd_transformed_weights.get()->buffer() + + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); // Prepare Weights _winograd_impl.weight_transform->execute( - *_conv_args, - permuted_weights_ptr, - permuted_weight_row_stride, - permuted_weight_col_stride, - permuted_weight_channel_stride, - win_wght_transf_ptr, - _winograd_impl.winograd_spec, - 0, 1 // Thread 1 of 1 + *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride, + permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1 ); ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get()); diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h index e0df34e2db..7e1d952462 100644 --- a/src/cpu/operators/CpuWinogradConv2d.h +++ b/src/cpu/operators/CpuWinogradConv2d.h @@ -26,10 +26,11 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/gemm_common.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuGemm.h" #include "src/cpu/operators/CpuPermute.h" @@ -73,7 +74,11 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. 
In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d @@ -82,13 +87,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -108,27 +117,28 @@ private: PermutedOutput = TransformedInput, Count = 10 }; - std::unique_ptr _gemm_function; - std::unique_ptr _activation_func; - std::unique_ptr _transform_input_kernel; - std::unique_ptr _transform_output_kernel; - std::unique_ptr _permute_input; - std::unique_ptr _permute_output; - std::unique_ptr _permute_weights; - experimental::MemoryRequirements _aux_mem{ Count }; - std::unique_ptr _conv_args; // Make it unique ptr because this type does not have a default constructor - arm_conv::winograd::WinogradImpl _winograd_impl; - DataLayout _data_layout; - TensorInfo _winograd_transformed_input; - TensorInfo _winograd_transformed_output; - TensorInfo _winograd_transformed_weights; - TensorInfo _input_workspace; - TensorInfo _output_workspace; - TensorInfo _weights_hwio; - TensorInfo _input_nhwc; - TensorInfo _output_nhwc; - bool _is_prepared; - bool _run_activation; + std::unique_ptr _gemm_function; + std::unique_ptr _activation_func; + std::unique_ptr _transform_input_kernel; + std::unique_ptr _transform_output_kernel; + std::unique_ptr _permute_input; + std::unique_ptr _permute_output; + std::unique_ptr _permute_weights; + experimental::MemoryRequirements _aux_mem{Count}; + std::unique_ptr + _conv_args; // Make it unique ptr because this type does not have a default constructor + arm_conv::winograd::WinogradImpl _winograd_impl; + DataLayout _data_layout; + TensorInfo _winograd_transformed_input; + TensorInfo _winograd_transformed_output; + TensorInfo _winograd_transformed_weights; + TensorInfo _input_workspace; + TensorInfo _output_workspace; + TensorInfo _weights_hwio; + TensorInfo _input_nhwc; + TensorInfo _output_nhwc; + bool _is_prepared; + bool _run_activation; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 3069d6b541..343ef21c0b 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -24,12 +24,13 @@ #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" 
#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" #include @@ -53,7 +54,12 @@ namespace * @param[in] num_threads Number of threads to run this method. Must be >= 1 */ template -void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads) +void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon *gemm_asm, + ITensor *dst, + const TypeInput *src, + int src_ld, + int src_multi_stride, + unsigned int num_threads) { ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr); ARM_COMPUTE_ERROR_ON(num_threads == 0); @@ -61,14 +67,14 @@ void run_parallel_pretranspose_B_array(arm_gemm::GemmCommonget_B_pretranspose_window_size(); std::vector workloads(num_threads); - for(unsigned int t = 0; t < num_threads; ++t) + for (unsigned int t = 0; t < num_threads; ++t) { - workloads[t] = [ = ](const ThreadInfo & info) + workloads[t] = [=](const ThreadInfo &info) { const unsigned int start = (info.thread_id * wsize) / num_threads; const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads; - if(start < end) + if (start < end) { gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end); } @@ -113,7 +119,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen p.sections = 1; p.indirect = false; - if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) { p.indirect = true; p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; @@ -125,7 +131,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen } // Update M in case of GEMM3D for output - if(info.depth_output_gemm3d != 0) + if (info.depth_output_gemm3d != 0) { p.M = d->tensor_shape().y() * d->tensor_shape().z(); p.batches = d->tensor_shape().total_size_upper(3) / p.multis; @@ -139,19 +145,24 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp // Schedule assembly kernel const int granule_threshold = 200; IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) { scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) + else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && + (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || + data_type == DataType::S8)) { //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + 
scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && + (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) { //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } return scheduling_hint; @@ -175,8 +186,12 @@ public: * @param[in] gemm_info GEMM meta-data * @param[in] os Output stage meta-data. */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os = {}); /** Set requantization shifts to be used @@ -193,19 +208,20 @@ public: * * @return A tuple with the pointers to the shift and multiplier data respectively */ - std::tuple set_requantize_data(const std::vector &shifts, - const std::vector &multipliers); + std::tuple + set_requantize_data(const std::vector &shifts, const std::vector &multipliers); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; bool is_configured() const override; experimental::MemoryRequirements workspace() const override; bool isVarWeightsKernel() const override { - if(!_gemm_kernel_asm) + if (!_gemm_kernel_asm) return false; - const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); + const arm_compute::WeightFormat wf = + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY; } @@ -229,15 +245,15 @@ private: void prepare_indirect_buffer(ITensorPack &tensors); /** Assembly Gemm kernel */ - std::shared_ptr> _gemm_kernel_asm{ nullptr }; + std::shared_ptr> _gemm_kernel_asm{nullptr}; /** Optimised Arm® Neon™ kernel */ - std::unique_ptr _optimised_kernel{ nullptr }; + std::unique_ptr _optimised_kernel{nullptr}; /** Assembly GEMM workspace tensor info */ TensorInfo _workspace_info{}; /** Pre-transpose tensor info */ TensorInfo _pretranspose_info{}; /** Prepared flag */ - bool _is_prepared{ false }; + bool _is_prepared{false}; /** GEMM meta-data */ AsmGemmInfo _gemm_info{}; /** GEMM kernel description */ @@ -251,26 +267,27 @@ private: /** Indirect buffer */ std::unique_ptr _indirect_arg{}; std::unique_ptr _indirect_buf{}; - std::vector _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; - experimental::MemoryRequirements _aux_mem{ Count }; - bool _B_pretranspose_required{ false }; - bool _is_b_constant{ true }; - bool _is_c_constant{ true }; + std::vector _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{Count}; + bool _B_pretranspose_required{false}; + bool 
_is_b_constant{true}; + bool _is_c_constant{true}; }; template std::tuple -Fallback::set_requantize_data(const std::vector &shifts, const std::vector &multipliers) +Fallback::set_requantize_data(const std::vector &shifts, + const std::vector &multipliers) { _multipliers = multipliers; _shifts = shifts; bool need_left = false; - for(const auto s : _shifts) + for (const auto s : _shifts) { left_shifts.push_back(std::max(-s, int32_t(0))); right_shifts.push_back(std::min(-s, int32_t(0))); - if(s < 0 && !need_left) + if (s < 0 && !need_left) { need_left = true; } @@ -295,32 +312,35 @@ void Fallback::prepare_indirect_buffer(ITens const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInput); - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t output_y = 0; output_y < _cp.output_height; output_y++) + for (int64_t output_y = 0; output_y < _cp.output_height; output_y++) { - for(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + for (int64_t output_x = 0; output_x < _cp.output_width; output_x++) { int64_t output_xy = (output_y * _cp.output_width) + output_x; - for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) { - for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) { int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; int64_t input_xy = (input_y * _cp.input_width) + input_x; - if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_pad.data(); } else { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A); } } @@ -332,12 +352,15 @@ void Fallback::prepare_indirect_buffer(ITens } template -void Fallback::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) +void Fallback::configure_indirect(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *d, + const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); float zeropad = 0.f; - if(is_data_type_quantized(a->data_type())) + if (is_data_type_quantized(a->data_type())) { zeropad = a->quantization_info().uniform().offset; } @@ -350,16 +373,25 @@ void Fallback::configure_indirect(const ITen const int64_t output_width = static_cast(d->tensor_shape()[1]); const int64_t output_height = static_cast(d->tensor_shape()[2]); - _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height, - info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad - }; - 
- if(info.method == AsmConvMethod::Conv) + _cp = {input_width, + input_height, + input_channels, + kernel_width, + kernel_height, + output_width, + output_height, + info.ps_info.stride().first, + info.ps_info.stride().second, + info.padding_top, + info.padding_left, + zeropad}; + + if (info.method == AsmConvMethod::Conv) { _gemm_kernel_asm->set_convolution_parameters(_cp); } - if(info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Indirect) { const unsigned int multis = 1; const unsigned int batches = a->tensor_shape().total_size_upper(3); @@ -372,19 +404,22 @@ void Fallback::configure_indirect(const ITen const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInputPtr); - _indirect_buf = std::unique_ptr(reinterpret_cast(malloc(multi_size * multis))); - _indirect_arg = std::unique_ptr(reinterpret_cast(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); + _indirect_buf = std::unique_ptr( + reinterpret_cast(malloc(multi_size * multis))); + _indirect_arg = std::unique_ptr( + reinterpret_cast(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); _indirect_pad = std::vector(_cp.input_channels, TypeInput(zeropad)); // Set indirect argument int64_t pos = 0; - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) + for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) { - (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; + (_indirect_arg.get())[pos++] = + _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; } } } @@ -394,8 +429,12 @@ void Fallback::configure_indirect(const ITen } template -void Fallback::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, +void Fallback::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os) { ARM_COMPUTE_UNUSED(c); @@ -404,7 +443,7 @@ void Fallback::configure(const ITensorInfo * _is_c_constant = c ? 
c->are_values_constant() : true; _gemm_kernel_asm = arm_gemm::gemm(args, os); - if(_gemm_kernel_asm == nullptr) + if (_gemm_kernel_asm == nullptr) { //configuration not supported: Leave function unconfigured: return; @@ -419,13 +458,14 @@ void Fallback::configure(const ITensorInfo * const size_t workspace_size = _gemm_kernel_asm->get_working_size(); const unsigned int alignment = 4096; _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8); - _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[AsmGemmWorkspace] = + MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 { const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); - if(window_size < static_cast(args._maxthreads)) + if (window_size < static_cast(args._maxthreads)) { _gemm_kernel_asm->set_nthreads(window_size); } @@ -434,18 +474,19 @@ void Fallback::configure(const ITensorInfo * _optimised_kernel = std::move(acl_gemm_wrapper); _gemm_info = gemm_info; // Check for pre-transposed support - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { // Forcing 128-byte alignment (required by 32-bit kernels) const unsigned int alignment = 128; const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8); - _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); - _B_pretranspose_required = true; + _aux_mem[Pretranspose] = + MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); + _B_pretranspose_required = true; } // Handle indirect GEMM convolution - if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) + if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) { configure_indirect(a, b, d, gemm_info); } @@ -454,34 +495,39 @@ void Fallback::configure(const ITensorInfo * template void Fallback::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. - if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { // Fixed format kernels need no pretranspose. 
- ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto in1_ptr = reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); + const auto in1_ptr = + reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), + in1_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); b->mark_as_unused(); } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { prepare_indirect_buffer(tensors); } @@ -526,12 +572,12 @@ void Fallback::run(ITensorPack &tensors) int multi_stride_b = 0; const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size(); - auto in0_ptr = reinterpret_cast(a->buffer() + a->info()->offset_first_element_in_bytes()); + auto in0_ptr = reinterpret_cast(a->buffer() + a->info()->offset_first_element_in_bytes()); const TypeInput *in1_ptr = nullptr; auto out_ptr = reinterpret_cast(d->buffer() + d->info()->offset_first_element_in_bytes()); // Check if B is pre-tranposed and de-reference if not - if(!_gemm_kernel_asm->B_is_pretransposed()) + if (!_gemm_kernel_asm->B_is_pretransposed()) { ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); @@ -539,30 +585,34 @@ void Fallback::run(ITensorPack &tensors) } // If necessary, run pretranspose every time if either weights or biases are non-constant - if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) + if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) { - if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required - if(_B_pretranspose_required) + if (_B_pretranspose_required) { - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto b_ptr = reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); + const auto b_ptr = + reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); + const int multi_stride_b = 
b->info()->strides_in_bytes().z() / b->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - if(_is_b_constant) + if (_is_b_constant) { _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b); } else { - run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array(_gemm_kernel_asm.get(), pretranspose.get(), + b_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); } } } @@ -571,17 +621,17 @@ void Fallback::run(ITensorPack &tensors) // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false); - if(workspace.get()->buffer() != nullptr) + if (workspace.get()->buffer() != nullptr) { _gemm_kernel_asm->set_working_space(reinterpret_cast(workspace.get()->buffer())); const unsigned int split_dim = scheduling_hint.split_dimension(); const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); unsigned int num_threads = NEScheduler::get().num_threads(); - if(window_size < num_threads) + if (window_size < num_threads) { num_threads = window_size; } - if(split_dim != IScheduler::split_dimensions_all) + if (split_dim != IScheduler::split_dimensions_all) { // Make sure the kernel does not expect more threads than we can actually spawn const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim); @@ -595,12 +645,12 @@ void Fallback::run(ITensorPack &tensors) // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
TypeOutput *bias = nullptr; - if(c && c->info()->data_type() != DataType::S32) + if (c && c->info()->data_type() != DataType::S32) { bias = reinterpret_cast(c->buffer() + c->info()->offset_first_element_in_bytes()); } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { in0_ptr = nullptr; lda = 0; @@ -609,18 +659,20 @@ void Fallback::run(ITensorPack &tensors) } // Set gemm parameters - _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, - in1_ptr, ldb, multi_stride_b, - out_ptr, ldd, batch_stride_d, multi_stride_d, - bias, 0); + _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, + ldd, batch_stride_d, multi_stride_d, bias, 0); // Schedule NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } template void create_arm_gemm(std::unique_ptr &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { Params p = extract_parameters(a, b, d, info); const CPUInfo &ci = NEScheduler::get().cpu_info(); @@ -628,7 +680,8 @@ void create_arm_gemm(std::unique_ptr &arm_ge arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique>(); @@ -638,8 +691,12 @@ void create_arm_gemm(std::unique_ptr &arm_ge template void create_arm_gemm_quant(std::unique_ptr &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(activation); Params p = extract_parameters(a, b, d, info); @@ -648,7 +705,8 @@ void create_arm_gemm_quant(std::unique_ptr & arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique>(); @@ -660,22 +718,20 @@ void create_arm_gemm_quant(std::unique_ptr & const GEMMLowpOutputStageInfo os_info = info.output_stage; arm_gemm::Requantize32 gemm_requant_info{}; - if(os_info.gemmlowp_shifts.size() > 1) + if (os_info.gemmlowp_shifts.size() > 1) { - const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - (std::get<0>(requantize_data)) ? 
std::get<1>(requantize_data) : nullptr, - std::get<2>(requantize_data), - std::get<3>(requantize_data), - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + const auto requantize_data = + fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); + gemm_requant_info = arm_gemm::Requantize32( + nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, + (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data), + std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); } else { - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier, - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + gemm_requant_info = + arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift, + os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); } // Configure fallback @@ -684,13 +740,16 @@ void create_arm_gemm_quant(std::unique_ptr & } } //namespace -CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() - : _arm_gemm(nullptr) +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr) { } -Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const AsmGemmInfo &info) +Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_UNUSED(c); @@ -701,53 +760,61 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg); - switch(a->data_type()) + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, + info.fixed_format, info.fast_mode, &cfg); + switch (a->data_type()) { case DataType::F32: - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F32 input"); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for U8/QASYMM8 input and U32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8/QASYMM8 input and U32 output"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for U8 input and U8 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, 
args, {})), + "We could not find an optimized kernel for U8 input and U8 output"); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for S8 input and S8 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for S8 input and S8 output"); } break; #endif /* __aarch64__ */ #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for BFLOAT16 input and F32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for BFLOAT16 input and F32 output"); break; } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for F16 input and F16 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F16 input and F16 output"); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -759,26 +826,30 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected return Status{}; } -Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) +Status CpuGemmAssemblyDispatch::validate( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(c, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run is false"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), + "Assembly kernel will not be executed when reshape_b_only_on_first_run is false"); #ifndef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); #endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_data_type_quantized_per_channel(b->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, 
DataType::S8, DataType::BFLOAT16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::BFLOAT16, DataType::F16, DataType::F32); + if (is_data_type_quantized_per_channel(b->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8); } - else if(is_fixed_format_fast_math(info.weight_format)) + else if (is_fixed_format_fast_math(info.weight_format)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); @@ -787,22 +858,29 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, + "Only F32 output supported for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, + "Only F16 output supported for F16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, + "Only F32 output supported for BFLOAT16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, + "Only U32 output supported for U8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, + "Only S32 output supported for S8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && + (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), "Only QASYMM8/S32 output supported for QASYMM8 input"); arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED; const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info); - if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) + if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) { // Correctness check: if the format expected by the kernel is // not "any", make sure that the one found matches the format // intended by the caller. 
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format), - "The format expected by the kernel does not correspond with the one requested by the user."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (expected_weight_format != info.weight_format), + "The format expected by the kernel does not correspond with the one requested by the user."); } return ret; } @@ -813,18 +891,19 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo return act.type != arm_gemm::Activation::Type::None; } -void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) +void CpuGemmAssemblyDispatch::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) + if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) { return; } - switch(a->data_type()) + switch (a->data_type()) { case DataType::F32: create_arm_gemm(_arm_gemm, a, b, c, d, act, info); @@ -832,7 +911,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } @@ -843,7 +922,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo break; case DataType::S8: case DataType::QASYMM8_SIGNED: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h index ceb7a3f775..5be39a54c0 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -42,20 +43,20 @@ enum class AsmConvMethod struct AsmGemmInfo { - AsmConvMethod method{ AsmConvMethod::Im2Col }; + AsmConvMethod method{AsmConvMethod::Im2Col}; PadStrideInfo ps_info{}; ActivationLayerInfo activation_info{}; GEMMLowpOutputStageInfo output_stage{}; - bool negated_offsets{ true }; - bool reinterpret_input_as_3d{ false }; - bool depth_output_gemm3d{ false }; - int64_t padding_top{ 0 }; - int64_t padding_left{ 0 }; - float padding_value{ 0.f }; - bool fast_mode{ false }; - bool fixed_format{ false }; - arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED }; - bool reshape_b_only_on_first_run{ true }; + bool negated_offsets{true}; + bool reinterpret_input_as_3d{false}; + bool depth_output_gemm3d{false}; + int64_t padding_top{0}; + int64_t padding_left{0}; + float padding_value{0.f}; + bool fast_mode{false}; + bool fixed_format{false}; + arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED}; + bool reshape_b_only_on_first_run{true}; }; /** Assembly kernel glue */ @@ -72,12 +73,12 @@ public: class IFallback { public: - 
virtual void run(ITensorPack &tensors) = 0; - virtual void prepare(ITensorPack &tensors) = 0; - virtual experimental::MemoryRequirements workspace() const = 0; - virtual bool is_configured() const = 0; - virtual bool isVarWeightsKernel() const = 0; - virtual ~IFallback() = default; + virtual void run(ITensorPack &tensors) = 0; + virtual void prepare(ITensorPack &tensors) = 0; + virtual experimental::MemoryRequirements workspace() const = 0; + virtual bool is_configured() const = 0; + virtual bool isVarWeightsKernel() const = 0; + virtual ~IFallback() = default; }; public: @@ -121,7 +122,8 @@ public: * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. * @param[in] info GEMM meta-data */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); + void configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); /** Indicates whether or not this function can be used to process the given parameters. * @@ -133,7 +135,11 @@ public: * * @return a status. */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -144,7 +150,12 @@ public: * * @return a status. */ - static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); /** Checks if activation is supported by the gemm assembly dispatcher * * @param[in] activation Activation to check @@ -167,8 +178,8 @@ public: } // Inherited methods overridden: - void prepare(ITensorPack &tensors) override; - void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h index ae1cffb659..e23b88a777 100644 --- a/src/cpu/utils/CpuAuxTensorHandler.h +++ b/src/cpu/utils/CpuAuxTensorHandler.h @@ -39,25 +39,26 @@ namespace cpu class CpuAuxTensorHandler { public: - CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) + CpuAuxTensorHandler( + int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) : _tensor() { - if(info.total_size() == 0) + if (info.total_size() == 0) { return; } _tensor.allocator()->soft_init(info); ITensor *packed_tensor = utils::cast::polymorphic_downcast(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) { - if(!bypass_alloc) + if (!bypass_alloc) { _tensor.allocator()->allocate(); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); } - if(pack_inject) + if (pack_inject) { 
pack.add_tensor(slot_id, &_tensor); _injected_tensor_pack = &pack; @@ -70,22 +71,21 @@ public: } } - CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) - : _tensor() + CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) : _tensor() { _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) + if (info.total_size() <= tensor.info()->total_size()) { _tensor.allocator()->import_memory(tensor.buffer()); } } - CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; + CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete; ~CpuAuxTensorHandler() { - if(_injected_tensor_pack) + if (_injected_tensor_pack) { _injected_tensor_pack->remove_tensor(_injected_slot_id); } @@ -103,9 +103,9 @@ public: private: Tensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */ diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp index 15a5632d0b..9ca20fa152 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp @@ -22,14 +22,15 @@ * SOFTWARE. */ #include "ClKernelRuntime.h" + #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/CLUtils.h" #ifdef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h" #endif // ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" #include "src/gpu/cl/ClKernelLibrary.h" - #include "support/Cast.h" namespace arm_compute { @@ -43,13 +44,12 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe { // Create kernel from kernel source string opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); - _kernel = static_cast(compile_ctx.create_kernel(code.name(), - code.name(), // program name has to be provided to differentiate between different unfusable components' kernels. - // Each program contains exactly one kernel - code.code(), - klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, - code.build_options().options(), - false /* Is source binary */)); + _kernel = static_cast(compile_ctx.create_kernel( + code.name(), + code.name(), // program name has to be provided to differentiate between different unfusable components' kernels. 
+ // Each program contains exactly one kernel + code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, + code.build_options().options(), false /* Is source binary */)); // Configure execution window IClKernel::configure_internal(code.window()); @@ -63,11 +63,15 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe #ifndef ACL_INTERNAL_TEST_CKW_IN_DF -inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images) +inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, + const GpuKernelArgumentInfo &arg, + const ICLTensor *tensor, + const Window &arg_slice, + std::vector &cl_images) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - switch(arg.type) + switch (arg.type) { case GpuKernelArgumentInfo::Type::Scalar: { @@ -95,9 +99,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * + tensor->info()->dimension(2) * + tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = + create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, + tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); break; @@ -111,9 +119,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * + tensor->info()->dimension(2) * + tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = + create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, + tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); @@ -142,8 +154,9 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1); const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), - TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = create_image2d_from_buffer( + 
CLKernelLibrary::get().context(), tensor->cl_buffer(), TensorShape(image_w, image_h), + tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); @@ -170,13 +183,16 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } #else // ACL_INTERNAL_TEST_CKW_IN_DF -inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector &cl_images) +inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, + const GpuKernelArgumentBinding &arg, + const ICLTensor *tensor, + std::vector &cl_images) { - switch(arg.type()) + switch (arg.type()) { case GpuKernelArgumentBinding::Type::TensorStorage: { - switch(arg.tensor_storage_type()) + switch (arg.tensor_storage_type()) { case TensorStorageType::ClBufferUint8Ptr: { @@ -238,7 +254,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com // CLImages created from tensor arguments. Need to be retained until enqueue std::vector cl_images; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - for(auto id_arg : _arguments) + for (auto id_arg : _arguments) { const auto arg = id_arg.second; auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(id_arg.first)); @@ -248,7 +264,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com } #else // ACL_INTERNAL_TEST_CKW_IN_DF - for(const auto &arg : _arguments) + for (const auto &arg : _arguments) { auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(arg.id())); ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); @@ -259,8 +275,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items); - } - while(skip_sliding_window && window.slide_window_slice_3D(slice)); + } while (skip_sliding_window && window.slide_window_slice_3D(slice)); } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h index 92e73503ce..e78567eb9d 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h @@ -68,7 +68,11 @@ private: * @param[in] arg_slice Window the kernel will be run on * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) */ - inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images); + inline void add_tensor_argument(unsigned int &idx, + const GpuKernelArgumentInfo &arg, + const ICLTensor *tensor, + const Window &arg_slice, + std::vector &cl_images); #else // ACL_INTERNAL_TEST_CKW_IN_DF /** Set a kernel argument as part of a tensor * @@ -77,7 +81,10 @@ private: * @param[in] tensor Tensor of which the kernel argument @p arg is a part of * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) */ - inline void add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector &cl_images); + inline void add_kernel_argument(unsigned int &idx, + const GpuKernelArgumentBinding &arg, + const ICLTensor *tensor, + std::vector &cl_images); #endif // ACL_INTERNAL_TEST_CKW_IN_DF private: diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp 
b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp index cd21b10180..ba39ff4c9d 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" @@ -55,14 +56,14 @@ public: { DataView() = default; DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info) - : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info } + : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info} { } - ~DataView() = default; - DataView(const DataView &other) = default; + ~DataView() = default; + DataView(const DataView &other) = default; DataView &operator=(const DataView &other) = default; DataView(DataView &&other) = default; - DataView &operator=(DataView &&other) = default; + DataView &operator=(DataView &&other) = default; CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */ TensorInfo tensor_info{}; /**< Associated tensor info */ AuxMemoryInfo memory_info{}; /**< Memory requirement */ @@ -92,7 +93,7 @@ private: { const auto t_id = tensor_info.id(); auto find_tensor_pair = _owned_tensors.find(t_id); - if(find_tensor_pair != _owned_tensors.end()) + if (find_tensor_pair != _owned_tensors.end()) { return find_tensor_pair->second.get(); } @@ -107,7 +108,7 @@ private: } std::map> _owned_tensors{}; - std::vector _tensors{}; + std::vector _tensors{}; }; /** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode * @@ -120,12 +121,12 @@ private: */ Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code) { - for(auto t_id : code.tensors()) + for (auto t_id : code.tensors()) { // Get tensor object const auto workload_arg = code.query_tensor(t_id); ICLTensor *tensor_object = nullptr; - if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary) + if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary) { // Create aux tensor CLTensor object const TensorInfo tensor_info = *workload_arg->tensor_info(); @@ -133,7 +134,7 @@ Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info; tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info); - if(tensor_object == nullptr) + if (tensor_object == nullptr) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor"); } @@ -156,7 +157,7 @@ public: ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id) { auto tensor_pack = _tensor_packs.find(uwk_id); - if(tensor_pack != _tensor_packs.end()) + if (tensor_pack != _tensor_packs.end()) { return &(tensor_pack->second); } @@ -173,7 +174,10 @@ public: return _tensor_packs.at(uwk_id); } - friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector &user_tensors, const ClAuxTensors &aux_tensors); + friend Status create_tensor_lut(ClTensorLUT *tensor_lut, + const GpuWorkloadSourceCode &code, + const std::vector &user_tensors, + const ClAuxTensors &aux_tensors); private: /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id @@ -197,19 +201,22 @@ private: * * @return Status */ -Status 
create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector &user_tensors, const ClAuxTensors &aux_tensors) +Status create_tensor_lut(ClTensorLUT *tensor_lut, + const GpuWorkloadSourceCode &code, + const std::vector &user_tensors, + const ClAuxTensors &aux_tensors) { // Combine user tensors and aux tensors std::map tensor_map; - for(auto tensor : user_tensors) + for (auto tensor : user_tensors) { const auto t_id = tensor->info()->id(); - if(tensor_map.find(t_id) != tensor_map.end()) + if (tensor_map.find(t_id) != tensor_map.end()) { // In case of elementwise in-place: give another Id to the In/Out tensor when passed again std::vector ids; - for(auto &t : tensor_map) + for (auto &t : tensor_map) { ids.push_back(t.first); } @@ -221,11 +228,11 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c tensor_map[t_id] = tensor; } } - for(const auto &data : aux_tensors.get_tensors()) + for (const auto &data : aux_tensors.get_tensors()) { const auto t_id = data.tensor_info.id(); const auto tensor = data.tensor; - if(tensor_map.find(t_id) != tensor_map.end()) + if (tensor_map.find(t_id) != tensor_map.end()) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids"); } @@ -233,25 +240,25 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c } // Add tensor objects into corresponding tensor packs - for(auto id_tensor : tensor_map) + for (auto id_tensor : tensor_map) { const auto t_id = id_tensor.first; const auto tensor_object = id_tensor.second; - if(tensor_object == nullptr) + if (tensor_object == nullptr) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs"); } - if(tensor_object->allocator()->info().total_size() == 0U) + if (tensor_object->allocator()->info().total_size() == 0U) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor"); } - for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id)) + for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id)) { ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id); - if(tensor_pack == nullptr) + if (tensor_pack == nullptr) { - tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } }); + tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}}); } else { @@ -269,15 +276,14 @@ struct ClWorkloadRuntime::Implementation { std::map> _kernels{}; std::map> _kernels_prep{}; - bool _is_configured{ false }; - bool _is_prepared{ false }; - ClTensorLUT _tensor_lut{}; - ClAuxTensors _aux_tensors{}; - GpuWorkloadSourceCode _source_code{}; + bool _is_configured{false}; + bool _is_prepared{false}; + ClTensorLUT _tensor_lut{}; + ClAuxTensors _aux_tensors{}; + GpuWorkloadSourceCode _source_code{}; }; -ClWorkloadRuntime::ClWorkloadRuntime() - : _impl{ std::make_unique() } +ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique()} { } @@ -286,18 +292,19 @@ ClWorkloadRuntime::~ClWorkloadRuntime() = default; Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, + "ClWorkloadRuntime cannot be configured with non-OpenCL workload 
sketch"); // Generate source code _impl->_source_code = sketch.implementation().generate_source_code(); // Configure unit workload from source code - for(auto uwk_id : _impl->_source_code.unit_workloads()) + for (auto uwk_id : _impl->_source_code.unit_workloads()) { const auto work = _impl->_source_code.query_unit_workload(uwk_id); const auto stage = work.stage().stage; auto k = std::make_unique(); k->configure(*sketch.gpu_context()->cl_compile_context(), work.code()); - switch(stage) + switch (stage) { case UnitWorkloadStage::Stage::Run: { @@ -323,9 +330,9 @@ Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch) void ClWorkloadRuntime::prepare() { - if(!_impl->_is_prepared) + if (!_impl->_is_prepared) { - for(auto &id_kernel_pair : _impl->_kernels_prep) + for (auto &id_kernel_pair : _impl->_kernels_prep) { const bool flush_queue = false; const auto uwk_id = id_kernel_pair.first; @@ -344,7 +351,7 @@ Status ClWorkloadRuntime::run(const std::vector &tensors) const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors); ARM_COMPUTE_RETURN_ON_ERROR(st); prepare(); - for(auto &id_kernel_pair : _impl->_kernels) + for (auto &id_kernel_pair : _impl->_kernels) { // Flush the command queue on the last kernel const bool flush_queue = false; @@ -358,7 +365,7 @@ Status ClWorkloadRuntime::run(const std::vector &tensors) std::vector> ClWorkloadRuntime::get_auxiliary_tensors() { std::vector> aux_tensors; - for(const auto &data : _impl->_aux_tensors.get_tensors()) + for (const auto &data : _impl->_aux_tensors.get_tensors()) { aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info); } diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp index 84fb279237..7044b0ea66 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp @@ -30,14 +30,17 @@ namespace experimental { namespace dynamic_fusion { -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component) +void cl_add_tensor_component_argument(cl::Kernel &kernel, + unsigned int &idx, + const ICLTensor *tensor, + TensorComponentType component) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); const auto *info = tensor->info(); const auto &strides = info->strides_in_bytes(); - switch(component) + switch (component) { case TensorComponentType::OffsetFirstElement: kernel.setArg(idx++, info->offset_first_element_in_bytes()); diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h index 4cbb157a48..306d547acb 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h @@ -42,7 +42,10 @@ namespace dynamic_fusion * @param[in] tensor Tensor from which to access the tensor component. * @param[in] component Tensor component to select such as tensor dimensions, strides, etc. 
*/ -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component); +void cl_add_tensor_component_argument(cl::Kernel &kernel, + unsigned int &idx, + const ICLTensor *tensor, + TensorComponentType component); /** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx. * diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h index f118d7d851..3bf380b1ec 100644 --- a/src/dynamic_fusion/sketch/ArgumentPack.h +++ b/src/dynamic_fusion/sketch/ArgumentPack.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK #include "arm_compute/core/experimental/Types.h" + #include #include @@ -52,26 +53,21 @@ public: */ struct PackElement { - PackElement() = default; - PackElement(const PackElement &elem) = default; + PackElement() = default; + PackElement(const PackElement &elem) = default; PackElement &operator=(const PackElement &elem) = default; PackElement(PackElement &&elem) = default; - PackElement &operator=(PackElement &&elem) = default; - PackElement(Id id, T *tensor) - : id(id), tensor(tensor), ctensor(nullptr) + PackElement &operator=(PackElement &&elem) = default; + PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr) { } - PackElement(Id id, const T *ctensor) - : id(id), tensor(nullptr), ctensor(ctensor) + PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor) { } - Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */ - T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */ - const T *ctensor - { - nullptr - }; /**< Const pointer to tensor-related object */ + Id id{ACL_UNKNOWN}; /**< Argument id within the pack */ + T *tensor{nullptr}; /**< Non-const pointer to tensor-related object */ + const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */ }; public: @@ -88,10 +84,9 @@ public: /** Allow instances of this class to be moved */ ArgumentPack &operator=(ArgumentPack &&other) = default; /** Initializer list Constructor */ - ArgumentPack(const std::initializer_list &l) - : _pack{} + ArgumentPack(const std::initializer_list &l) : _pack{} { - for(const auto &e : l) + for (const auto &e : l) { _pack[e.id] = e; } @@ -134,7 +129,7 @@ public: const T *get_const_tensor(Id id) const { auto it = _pack.find(id); - if(it != _pack.end()) + if (it != _pack.end()) { return it->second.ctensor != nullptr ? 
it->second.ctensor : it->second.tensor; } @@ -171,10 +166,10 @@ public: std::vector get_src_tensors() { std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + for (int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) { auto tensor = get_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { src_tensors.push_back(tensor); } @@ -188,10 +183,10 @@ public: std::vector get_const_src_tensors() const { std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + for (int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) { auto tensor = get_const_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { src_tensors.push_back(tensor); } @@ -205,10 +200,10 @@ public: std::vector get_dst_tensors() { std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + for (int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) { auto tensor = get_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { dst_tensors.push_back(tensor); } @@ -222,10 +217,10 @@ public: std::vector get_const_dst_tensors() const { std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + for (int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) { auto tensor = get_const_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { dst_tensors.push_back(tensor); } diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp index 3a5657e07b..6f3816568c 100644 --- a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp +++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp @@ -69,7 +69,8 @@ uint32_t DepthwiseConv2dAttributes::depth_multiplier() const return _depth_multiplier; } -DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) +DepthwiseConv2dAttributes & +DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) { _dimension_rounding_type = dimension_rounding_type; return *this; diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp index c28791f5fe..80f65f926a 100644 --- a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp +++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + #include "arm_compute/core/Size2D.h" namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h index 226e1a2df3..03817173f4 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h @@ -61,11 +61,10 @@ struct GpuKernelArgumentInfo /** Default constructor */ GpuKernelArgumentInfo() = default; /** Constructor */ - GpuKernelArgumentInfo(Type type) - : type{ type } + GpuKernelArgumentInfo(Type type) : type{type} { } - Type type{ Type::Tensor_4D_t_Buffer }; + Type type{Type::Tensor_4D_t_Buffer}; }; 
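As a quick orientation for the hunks in this region (and the section as a whole), the snippet below is a hand-written illustration, not code from the repository, of the conventions the reformatting visibly applies: a space after control-flow keywords, brace member initializers without inner padding, column-aligned declarations, and long parameter lists broken one parameter per line. The type and function names are hypothetical.

#include <cstdint>

struct ExampleInfo // hypothetical type, for illustration only
{
    bool    fast_path{false}; // brace initializer without inner spaces (was: { false })
    int64_t padding_top{0};   // declarations column-aligned
};

// Long signatures are wrapped with one parameter per line, aligned on the first parameter.
static bool configure_example(const ExampleInfo &info,
                              int                rows,
                              int                cols)
{
    if (rows <= 0 || cols <= 0) // space after 'if' (was: if(...))
    {
        return false;
    }
    return info.fast_path && (rows * cols) > 0;
}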
bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1); /** Kernel argument information linked with its corresponding @ref ITensorInfo @@ -79,10 +78,8 @@ public: * @param[in] tensor_info Associated @ref ITensorInfo * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo */ - GpuKernelArgument(const ITensorInfo &tensor_info, - const GpuKernelArgumentInfo &kernel_arg_info) - : _tensor_info{ tensor_info }, - _kernel_arg_info{ kernel_arg_info } + GpuKernelArgument(const ITensorInfo &tensor_info, const GpuKernelArgumentInfo &kernel_arg_info) + : _tensor_info{tensor_info}, _kernel_arg_info{kernel_arg_info} { } /** Get workload tensor id */ @@ -200,12 +197,12 @@ public: TensorComponent /** @ref TensorComponentType */ }; GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage) - : _type{ Type::TensorStorage }, _id{ id }, _value{} + : _type{Type::TensorStorage}, _id{id}, _value{} { _value.tensor_storage_type = storage; } GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component) - : _type{ Type::TensorComponent }, _id{ id }, _value{} + : _type{Type::TensorComponent}, _id{id}, _value{} { _value.tensor_component_type = component; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp index 5a65ede38b..1a458c9862 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp @@ -31,35 +31,31 @@ namespace experimental { namespace dynamic_fusion { -std::vector GpuKernelComponentGraph::get_tensor_ids(const std::vector tensors) +std::vector +GpuKernelComponentGraph::get_tensor_ids(const std::vector tensors) { std::vector tensor_ids{}; - std::transform( - std::begin(tensors), std::end(tensors), - std::back_inserter(tensor_ids), - [](const auto & t) - { - return t->id(); - }); + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); return tensor_ids; } GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services) - : _context{ context }, _services{ services }, _components{}, _tensors{}, _dependency_graph{} + : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{} { } GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const { - GpuKernelComponentStream stream{ _context, _services, mem_map }; + GpuKernelComponentStream stream{_context, _services, mem_map}; const auto op_seq = _dependency_graph.build_operators_sequence(); stream.new_component_group(); - for(auto op : op_seq) + for (auto op : op_seq) { const auto component = _components.at(op.op).get(); const auto success = stream.add_component(component); - if(!success) // Assume first failure was because the root component is unfusable + if (!success) // Assume first failure was because the root component is unfusable { stream.new_component_group(); const auto success = stream.add_component(component); diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h index 85c9b45840..6f871a3c90 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h @@ -70,21 +70,21 @@ public: * @param[in] args Component arguments except for component id, which is auto-allocated */ template - void add_new_component(Args &&... 
args) + void add_new_component(Args &&...args) { - auto comp = _services->component_factory().create(std::forward(args)...); - ArgumentPack tensors = comp->tensors(); + auto comp = _services->component_factory().create(std::forward(args)...); + ArgumentPack tensors = comp->tensors(); const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors()); - bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); + bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); ARM_COMPUTE_UNUSED(success); ARM_COMPUTE_ERROR_ON(!success); _components[comp->id()] = std::move(comp); - for(auto t : tensors.get_const_src_tensors()) + for (auto t : tensors.get_const_src_tensors()) { _tensors[t->id()] = t; } - for(auto t : tensors.get_const_dst_tensors()) + for (auto t : tensors.get_const_dst_tensors()) { _tensors[t->id()] = t; } @@ -99,11 +99,11 @@ public: private: static std::vector get_tensor_ids(const std::vector tensors); - GpuWorkloadContext *_context; - GpuComponentServices *_services; + GpuWorkloadContext *_context; + GpuComponentServices *_services; std::map> _components; std::map _tensors; - DependencyGraph _dependency_graph{}; + DependencyGraph _dependency_graph{}; }; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp index 81c3f0c800..5a6d125d96 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Validate.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include @@ -37,86 +38,87 @@ namespace dynamic_fusion { bool GpuKernelComponentGroup::add_component(ComponentPtr component) { - ARM_COMPUTE_ERROR_ON_MSG( - _finalized, "The component group has been finalized and cannot be altered."); + ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered."); // note: Constraint 1 is guaranteed as a precondition // Constraint 2 - if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) + if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) { return false; } // Constraint 3.1: Pattern: (Unfusable + Output) - if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output) + if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && + component->type() != GpuComponentType::Output) { return false; } // Constraint 3.2 - if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) + if (!_components.empty() && + (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) { return false; } // Constraint 4 - if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) + if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) { return false; } // Constraint 5 - if(!_components.empty() && !(get_root_component()->properties() == component->properties())) + if (!_components.empty() && 
!(get_root_component()->properties() == component->properties())) { return false; } // Constraint 7 - if(!_components.empty()) + if (!_components.empty()) { const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor = root_dst_tensors[0]; const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } } // Constraint 8 - if(!_components.empty()) + if (!_components.empty()) { const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } } // Constraint 9 - if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) + if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) { return false; } // Constraint 9 corollary - if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) + if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) { return false; } @@ -126,36 +128,36 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) void GpuKernelComponentGroup::finalize() { - if(_finalized) + if (_finalized) { return; } _finalized = true; - std::set output_tensors; + std::set output_tensors; std::map> possible_tile_map; - std::map tile_usages; + std::map tile_usages; - for(auto component : _components) + for (auto component : _components) { - const auto tensors = component->tensors(); + const auto tensors = component->tensors(); const auto src_tensors = tensors.get_const_src_tensors(); const auto dst_tensors = tensors.get_const_dst_tensors(); // Detect input, output and intermediate tensors. - for(auto tensor : src_tensors) + for (auto tensor : src_tensors) { const auto output_tensors_it = output_tensors.find(tensor); - if(output_tensors_it != output_tensors.end()) + if (output_tensors_it != output_tensors.end()) { // This tensor is the output of another operator. // It must be marked as intermediate tensor. 
output_tensors.erase(output_tensors_it); _interm_tensors.insert(tensor); } - else if(_interm_tensors.find(tensor) == _interm_tensors.end()) + else if (_interm_tensors.find(tensor) == _interm_tensors.end()) { _input_tensors.insert(tensor); @@ -164,7 +166,7 @@ void GpuKernelComponentGroup::finalize() } } - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end()); ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end()); @@ -177,27 +179,27 @@ void GpuKernelComponentGroup::finalize() // Check if the output can overwrite the input tile. const auto component_type = component->type(); - if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output) + if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output) { ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1); - const auto dst_tensor = dst_tensors[0]; - const auto &dst_shape = dst_tensor->tensor_shape(); - const auto &dst_type = dst_tensor->data_type(); + const auto dst_tensor = dst_tensors[0]; + const auto &dst_shape = dst_tensor->tensor_shape(); + const auto &dst_type = dst_tensor->data_type(); tile_usages[dst_tensor] = 0; - for(auto src_tensor : src_tensors) + for (auto src_tensor : src_tensors) { const auto &src_shape = src_tensor->tensor_shape(); - const auto &src_type = src_tensor->data_type(); + const auto &src_type = src_tensor->data_type(); - if(src_shape == dst_shape && src_type == dst_type) + if (src_shape == dst_shape && src_type == dst_type) { const auto tile_usages_it = tile_usages.find(src_tensor); ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end()); - if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0) + if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0) { // Increase the number of tile usages unless this component is an output // and the tile has not been shared with any component. @@ -212,7 +214,7 @@ void GpuKernelComponentGroup::finalize() else { // Outputs of complex and unfusable components need dedicated tile. - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { tile_usages[tensor] = 0; } @@ -220,25 +222,25 @@ void GpuKernelComponentGroup::finalize() } // Find the smallest list of tiles that the intermediate tensors need to write to. - for(auto tensor : _input_tensors) + for (auto tensor : _input_tensors) { _tile_map[tensor] = tensor; } - for(auto component : _components) + for (auto component : _components) { const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { const auto target_tiles = possible_tile_map.at(tensor); - _tile_map[tensor] = tensor; + _tile_map[tensor] = tensor; - for(auto target : target_tiles) + for (auto target : target_tiles) { const auto num_usage = tile_usages[target]; - if(num_usage <= 1) + if (num_usage <= 1) { // The target tile is consumed by only this operator, so we can reuse it // for the destination tensor data. @@ -249,26 +251,23 @@ void GpuKernelComponentGroup::finalize() } } - for(auto tensor : output_tensors) + for (auto tensor : output_tensors) { _tile_map[tensor] = tensor; } // All intermediate tensors that cannot be shared with any previous tensor // will need to be declared as tile variable. 
- for(auto tensor_tile : _tile_map) + for (auto tensor_tile : _tile_map) { - if(tensor_tile.first == tensor_tile.second && - _interm_tensors.find(tensor_tile.first) != _interm_tensors.end()) + if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end()) { _tiles.push_back(tensor_tile.first); } } - std::set_union( - _input_tensors.begin(), _input_tensors.end(), - output_tensors.begin(), output_tensors.end(), - std::back_inserter(_argument_tensors)); + std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(), + std::back_inserter(_argument_tensors)); _any_output_tensor = *output_tensors.begin(); } @@ -282,7 +281,7 @@ const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInf { ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); - if(_tile_map.find(tensor) != _tile_map.end()) + if (_tile_map.find(tensor) != _tile_map.end()) { return _tile_map.at(tensor); } @@ -304,7 +303,7 @@ std::vector GpuKernelComponentGroup::get_argument_tensors() GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const { - if(empty()) + if (empty()) { return nullptr; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h index c939aec369..6ad71abb39 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h @@ -25,12 +25,11 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP #include "components/Types.h" - #include #include -#include -#include #include +#include +#include namespace arm_compute { @@ -129,9 +128,9 @@ public: /** Get the number of components within the group */ size_t size() const; /** Check if the component group is empty */ - bool empty() const; - ComponentPtr &operator[](size_t index); - const ComponentPtr &operator[](size_t index) const; + bool empty() const; + ComponentPtr &operator[](size_t index); + const ComponentPtr &operator[](size_t index) const; typename std::vector::iterator begin(); typename std::vector::iterator end(); typename std::vector::const_iterator begin() const; @@ -142,13 +141,13 @@ public: private: std::vector _components{}; - bool _finalized{ false }; + bool _finalized{false}; - std::vector _argument_tensors{}; - std::set _input_tensors{}; - std::set _interm_tensors{}; - const ITensorInfo *_any_output_tensor{ nullptr }; - std::vector _tiles{}; + std::vector _argument_tensors{}; + std::set _input_tensors{}; + std::set _interm_tensors{}; + const ITensorInfo *_any_output_tensor{nullptr}; + std::vector _tiles{}; std::map _tile_map{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp index a2b6623370..8042e3dd08 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -23,9 +23,9 @@ */ #include "GpuKernelComponentStream.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute { @@ -33,8 +33,10 @@ namespace experimental { namespace dynamic_fusion { 
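The GpuKernelComponentGroup::finalize() hunks above only reflow the tile-reuse bookkeeping; the logic is unchanged: every tensor is mapped to the tile whose storage it can share, and only intermediate tensors that end up mapped to themselves are declared as new tile variables. As a rough illustration of the mapping rule (not the library's actual data structures; the names and the usage-count heuristic are simplified from what is visible in the patch), the sketch below reuses a candidate source tile only when it is consumed by a single operator, and otherwise falls back to a dedicated tile.

    #include <map>
    #include <vector>

    struct TensorId
    {
        int  value;
        bool operator<(TensorId other) const { return value < other.value; }
    };

    // Simplified version of the _tile_map selection in finalize(): a destination tensor may
    // overwrite one of its candidate source tiles, but only if that tile is not consumed by
    // any other operator (usage count <= 1); otherwise it keeps its own tile.
    TensorId pick_tile(TensorId dst,
                       const std::vector<TensorId>   &candidate_tiles,
                       const std::map<TensorId, int> &tile_usages)
    {
        TensorId chosen = dst; // default: the tensor needs a dedicated tile
        for (TensorId target : candidate_tiles)
        {
            if (tile_usages.at(target) <= 1)
            {
                chosen = target; // reuse the source tile for the destination data
                break;
            }
        }
        return chosen;
    }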
-GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map) - : _context{ context }, _services{ services }, _component_groups{}, _mem_map{ mem_map } +GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, + GpuComponentServices *services, + const MemoryDescriptorMap &mem_map) + : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map} { } @@ -42,7 +44,7 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() { GpuWorkloadSourceCode source_code; // Traverse through component groups and assemble workload together - for(auto && group : _component_groups) + for (auto &&group : _component_groups) { group.finalize(); diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h index ba2503a938..ef8a8a15b0 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" @@ -53,7 +54,9 @@ public: * @param[in] services @ref GpuComponentServices to be used throughout the stream * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode */ - GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map); + GpuKernelComponentStream(GpuWorkloadContext *context, + GpuComponentServices *services, + const MemoryDescriptorMap &mem_map); /** Allow instances of this class to be copy constructed */ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default; /** Allow instances of this class to be copied */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h index 64e1cdc3bc..24812cd8a7 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp index c99984fc0e..502ceab807 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/experimental/Types.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h" #else // ACL_INTERNAL_TEST_CKW_IN_DF @@ -42,7 +42,7 @@ namespace experimental namespace dynamic_fusion { GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components) - : _comp_group{ 
components }, _store_components{} + : _comp_group{components}, _store_components{} { ARM_COMPUTE_UNUSED(services); } @@ -51,9 +51,9 @@ GpuKernelSourceCode GpuLogicalKernel::write_kernel_code() { GpuKernelSourceCode code; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - ClTemplateWriter writer { _comp_group }; + ClTemplateWriter writer{_comp_group}; #else // ACL_INTERNAL_TEST_CKW_IN_DF - GpuCkwDriver writer { _comp_group }; + GpuCkwDriver writer{_comp_group}; #endif // ACL_INTERNAL_TEST_CKW_IN_DF code.name(writer.get_name()); diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp index 7bb14c8698..aec8b9db4f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -36,20 +36,15 @@ namespace std::vector get_tensor_ids(const std::vector tensors) { std::vector tensor_ids{}; - std::transform( - std::begin(tensors), std::end(tensors), - std::back_inserter(tensor_ids), - [](const auto & t) - { - return t->id(); - }); + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); return tensor_ids; } } // namespace Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack &tensors) - : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors } + : _id{id}, _operator_type{operator_type}, _tensors{tensors} { } @@ -73,69 +68,69 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) cons const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); // Constraint 1 - if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) + if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) { return false; } // Constraint 2 - if(_operators.size() >= max_fused_operators) + if (_operators.size() >= max_fused_operators) { return false; } // Constraint 3.1: Pattern: (Unfusable) - if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) + if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) { return false; } // Constraint 3.2 - if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) + if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) { return false; } // Constraint 4 - if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) + if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) { return false; } // Constraint 5 - if(_operators.size() > 0) + if (_operators.size() > 0) { const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor = root_dst_tensors[0]; const auto dst_tensors = op.tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if 
(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } } // Constraint 6 - if(_operators.size() > 0) + if (_operators.size() > 0) { const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); const auto dst_tensors = op.tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } @@ -151,16 +146,17 @@ void GpuOperatorGroup::add_operator(const Operator &op, bool is_output) _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output); _operators[op.id()] = op; } -Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack &tensors) const +Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, + const ArgumentPack &tensors) const { auto new_id = static_cast(_operators.size()); - return Operator{ new_id, operator_type, tensors }; + return Operator{new_id, operator_type, tensors}; } const Operator *GpuOperatorGroup::get_root_operator() const { const auto roots = _graph.get_root_ops(); ARM_COMPUTE_ERROR_ON(roots.size() > 1); - if(roots.empty()) + if (roots.empty()) { return nullptr; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h index 308a9d796a..0a2369d357 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h @@ -25,9 +25,11 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h" #include "src/dynamic_fusion/sketch/utils/DependencyGraph.h" + #include namespace arm_compute @@ -104,7 +106,7 @@ public: const Operator *get_root_operator() const; private: - DependencyGraph _graph{}; + DependencyGraph _graph{}; std::map _operators{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp index c2bd012703..36cad790c7 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp @@ -23,7 +23,9 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" + #include "arm_compute/core/CL/CLCompileContext.h" + #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" namespace arm_compute @@ -33,7 +35,7 @@ namespace experimental namespace dynamic_fusion { GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx) - : _impl{ std::make_unique(GpuLanguage::OpenCL, cl_compile_ctx) } + : _impl{std::make_unique(GpuLanguage::OpenCL, cl_compile_ctx)} { } @@ -74,7 +76,11 @@ const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const } GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx) - : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx), _next_tensor_id(1), _mem_map(), _managed_tensor_info() + : _gpu_language(gpu_language), 
+ _cl_compile_ctx(cl_compile_ctx), + _next_tensor_id(1), + _mem_map(), + _managed_tensor_info() { } @@ -100,7 +106,7 @@ void GpuWorkloadContext::Impl::register_user_tensor(ITensorInfo &tensor_info) const auto tensor_id = next_tensor_id(); tensor_info.set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::User }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User}; // Save a *copy* of the user tensor info in workload context for future reference // Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context _managed_tensor_info.emplace(tensor_info.id(), std::make_unique(tensor_info)); @@ -111,7 +117,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor() auto tensor_info = std::make_unique(); const auto tensor_id = -next_tensor_id(); tensor_info->set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual}; auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info)); return inserted.first->second.get(); } @@ -121,7 +127,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo auto tensor_info = std::make_unique(itensor_info); const auto tensor_id = next_tensor_id(); tensor_info->set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, AuxMemoryInfo{ tensor_info->total_size() } }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}}; auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info)); return inserted.first->second.get(); } diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h index c169476a70..7d9699031f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h @@ -27,8 +27,8 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" namespace arm_compute { @@ -93,8 +93,8 @@ private: GpuLanguage _gpu_language; CLCompileContext *_cl_compile_ctx; - ITensorInfo::Id _next_tensor_id; - MemoryDescriptorMap _mem_map; + ITensorInfo::Id _next_tensor_id; + MemoryDescriptorMap _mem_map; std::map> _managed_tensor_info; }; diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp index d3a20c0dfe..973f7c747f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute @@ -30,8 +31,7 @@ namespace experimental { namespace dynamic_fusion { -GpuWorkloadSketch::GpuWorkloadSketch(Context *context) - : _impl{ std::make_unique(context) } +GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique(context)} { } GpuWorkloadSketch::~GpuWorkloadSketch() diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h index d3033898e9..fea4fe9577 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h @@ -24,8 +24,9 @@ #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL -#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h" @@ -45,12 +46,8 @@ public: * * @param[in] context global workload creation context */ - explicit Implementation( - Context *context) - : _context{ context }, - _comp_services{}, - _component_graph{ _context, &_comp_services }, - _operator_group{} + explicit Implementation(Context *context) + : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{} { } /** Prevent instances of this class from being copy constructed */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h index 578366daaf..43bcc47fa0 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" @@ -45,7 +46,7 @@ namespace */ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args) { - if(flat_kernel_args.empty()) + if (flat_kernel_args.empty()) { return {}; } @@ -56,10 +57,10 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList & flat_kernel_args.pop_front(); const auto tensor_id = karg_head.id(); - while(!flat_kernel_args.empty()) + while (!flat_kernel_args.empty()) { const GpuKernelArgumentBinding &karg = flat_kernel_args.front(); - if(karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments + if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments { return tensor_kargs; } @@ -68,7 +69,7 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList & } return tensor_kargs; } -} +} // namespace #endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */ using UnitWorkloadId = int32_t; @@ -92,9 +93,7 @@ public: GpuWorkloadArgument(const ITensorInfo &tensor_info, const MemoryDescriptor &mem_desc, const GpuKernelArgumentInfo &kernel_arg_info) - : _tensor_info{ tensor_info }, - _mem_desc{ mem_desc }, - 
_kernel_arg_info{ kernel_arg_info } + : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_arg_info{kernel_arg_info} { } #else // ACL_INTERNAL_TEST_CKW_IN_DF @@ -107,9 +106,7 @@ public: GpuWorkloadArgument(const ITensorInfo &tensor_info, const MemoryDescriptor &mem_desc, const GpuKernelArgumentList &kernel_args) - : _tensor_info{ tensor_info }, - _mem_desc{ mem_desc }, - _kernel_args{ kernel_args } + : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args} { } #endif // ACL_INTERNAL_TEST_CKW_IN_DF @@ -175,9 +172,9 @@ private: TensorInfo _tensor_info{}; MemoryDescriptor _mem_desc{}; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - GpuKernelArgumentInfo _kernel_arg_info {}; + GpuKernelArgumentInfo _kernel_arg_info{}; #else // ACL_INTERNAL_TEST_CKW_IN_DF - GpuKernelArgumentList _kernel_args {}; + GpuKernelArgumentList _kernel_args{}; #endif // ACL_INTERNAL_TEST_CKW_IN_DF }; @@ -190,7 +187,7 @@ struct UnitWorkloadStage Prepare, /**< Only run once at the beginning. */ Run, /**< Run every time after the first time. */ }; - Stage stage{ Stage::Run }; + Stage stage{Stage::Run}; }; inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1) @@ -212,7 +209,7 @@ public: * @param[in] stage Stage of the unit workload */ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage) - : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage } + : _id{id}, _kernel_code{kernel_code}, _stage{stage} { } /** Get the id of the unit workload */ @@ -253,7 +250,10 @@ public: * * @return UnitWorkloadId Allocated unit workload id */ - UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map, const GpuWorkloadContext *context) + UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, + const UnitWorkloadStage &stage, + const MemoryDescriptorMap &mem_map, + const GpuWorkloadContext *context) { // Use the size of the kernel codes as Id const auto uwk_id = static_cast(_unit_workloads.size()); @@ -262,12 +262,13 @@ public: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF ARM_COMPUTE_UNUSED(context); // Assemble kernel argument with memory descriptor to form workload argument - for(const auto &id_arg : kernel_code.arguments()) + for (const auto &id_arg : kernel_code.arguments()) { - const auto arg_id = id_arg.first; - const auto arg = id_arg.second; - _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() }; - if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end()) + const auto arg_id = id_arg.first; + const auto arg = id_arg.second; + _workload_arguments[arg_id] = + GpuWorkloadArgument{*arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info()}; + if (_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end()) { _tensor_uwork_map[arg_id] = std::set(); } @@ -276,18 +277,19 @@ public: #else // ACL_INTERNAL_TEST_CKW_IN_DF GpuKernelArgumentList flat_kernel_args = kernel_code.arguments(); GpuKernelArgumentList tensor_kargs{}; - while(true) + while (true) { tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args); - if(tensor_kargs.empty()) + if (tensor_kargs.empty()) { break; } else { const auto tensor_id = tensor_kargs.at(0).id(); - _workload_arguments[tensor_id] = GpuWorkloadArgument{ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs }; - if(_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end()) + 
_workload_arguments[tensor_id] = GpuWorkloadArgument{ + *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs}; + if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end()) { _tensor_uwork_map[tensor_id] = std::set(); } @@ -308,7 +310,7 @@ public: { std::vector ids{}; - for(const auto &uwk : _unit_workloads) + for (const auto &uwk : _unit_workloads) { ids.push_back(uwk.id()); } @@ -323,7 +325,7 @@ public: std::vector tensors() const { std::vector ids{}; - for(const auto &id_tensor : _workload_arguments) + for (const auto &id_tensor : _workload_arguments) { ids.push_back(id_tensor.first); } @@ -337,7 +339,7 @@ public: } private: - std::vector _unit_workloads{}; + std::vector _unit_workloads{}; std::map _workload_arguments{}; std::map> _tensor_uwork_map{}; }; diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h index 1d8b231efd..ad474674f9 100644 --- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h +++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp index 4b4c22fa1d..c4ab110c92 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" namespace arm_compute @@ -36,12 +37,12 @@ GpuCkwComponentArgument::GpuCkwComponentArgument() { } -GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) - : _tensor(&tensor) +GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor) { } -GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) +GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, + const ckw::TensorTileSampler &tile_sampler) { CKW_ASSERT(_tile == nullptr); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h index 80f91389a0..863989a7bd 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h @@ -110,9 +110,9 @@ public: const ckw::TensorTileSampler &tile_sampler() const; private: - ckw::TensorOperand *_tensor{ nullptr }; - ckw::TileOperand *_tile{ nullptr }; - ckw::TensorTileSampler _tile_sampler{}; + ckw::TensorOperand *_tensor{nullptr}; + ckw::TileOperand *_tile{nullptr}; + ckw::TensorTileSampler _tile_sampler{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp index a24a172d77..c927f32bde 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp @@ -23,17 +23,16 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" -#include 
"src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Window.h" + #include "src/common/utils/Log.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" - #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" using namespace ckw; namespace arm_compute @@ -43,11 +42,11 @@ namespace experimental namespace dynamic_fusion { GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) - : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }, _code{} + : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{} { // Generate kernel name std::string name = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -60,7 +59,7 @@ GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) GpuCkwScopedKernelWriter writer(&root_writer); GpuCkwVariableTable vtable{}; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -82,7 +81,7 @@ std::string GpuCkwDriver::get_code() std::string GpuCkwDriver::get_config_id() { std::string id = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -101,9 +100,9 @@ Window GpuCkwDriver::get_window() const GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments() { GpuKernelArgumentList args{}; - for(const auto &arg : _kernel.arguments()) + for (const auto &arg : _kernel.arguments()) { - switch(arg.type()) + switch (arg.type()) { case KernelArgument::Type::TensorStorage: { diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h index 19db575fea..2ca5fb435c 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER +#include "ckw/Kernel.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h" -#include "ckw/Kernel.h" - #include #include diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp index ca4f121566..5f8ce919e3 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp @@ -23,10 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" #include "ckw/TileInfo.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + namespace arm_compute { 
namespace experimental @@ -34,21 +36,21 @@ namespace experimental namespace dynamic_fusion { -GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) - : KernelWriter(kernel) +GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel) { } void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) { - if(!tensor_or_tile->has_tile()) + if (!tensor_or_tile->has_tile()) { CKW_ASSERT(tensor_or_tile->has_tensor()); auto &tensor = tensor_or_tile->tensor(); const auto tile_name = tensor.name() + "_tile"; - auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); + auto &tile = + declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); op_load(tile, tensor, sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp index 043fda9e6f..cbadbd9639 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h index 4d11b5e3e4..81049bfe37 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h @@ -63,7 +63,7 @@ public: private: GpuCkwKernelWriter *_writer; - int32_t _parent_id_space; + int32_t _parent_id_space; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp index 37c27cd116..88a0cf7f43 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp @@ -23,11 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include namespace arm_compute @@ -36,19 +37,22 @@ namespace experimental { namespace dynamic_fusion { -GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias) +GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); 
// Do not re-declare if the variable associated with the tensor has already been declared auto it = _vars.find(tensor->id()); - if(it != _vars.end()) + if (it != _vars.end()) { return &it->second; } - if(comp_group.is_intermediate_tensor(tensor)) + if (comp_group.is_intermediate_tensor(tensor)) { // Create a virtual tensor variable GpuCkwComponentArgument var; @@ -61,7 +65,7 @@ GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCo std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); const auto uniq_name = ss.str(); - GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) }; + GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))}; auto &&inserted = _vars.emplace(tensor->id(), var); return &(inserted.first->second); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h index 0649dcba9d..2b118911b8 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" #include @@ -58,8 +59,11 @@ public: * * @return GpuCkwComponentArgument* */ - GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias = "unnamed"); + GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias = "unnamed"); private: std::map _vars{}; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h index 14086f785e..52e56e2e35 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" @@ -73,8 +74,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - IGpuCkwComponentDriver(ComponentId id, const ArgumentPack &tensors) - : _id{ id }, _tensors{ tensors } + IGpuCkwComponentDriver(ComponentId id, const ArgumentPack &tensors) : _id{id}, _tensors{tensors} { } /** Destructor */ @@ -89,7 +89,9 @@ public: * * @note @p writer can only be passed via value since the new scope is created in the copy constructor */ - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const = 0; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const = 0; /** Get tensor arguments */ ArgumentPack tensors() const { @@ -128,7 +130,7 @@ public: } private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; ArgumentPack _tensors{}; }; } // namespace dynamic_fusion diff --git 
a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp index c07fac0e0d..c3b1b3c8bc 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp @@ -24,16 +24,18 @@ #include "GpuCkwActivation.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -87,24 +89,25 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ GpuCkwActivation::GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler); @@ -119,7 +122,7 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, Gp const auto &constant_B = writer->declare_tile("B_VAL", _attributes.b()); // Perform the operation. 
- switch(_attributes.activation()) + switch (_attributes.activation()) { case ActivationLayerInfo::ActivationFunction::LOGISTIC: { @@ -179,9 +182,10 @@ Window GpuCkwActivation::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h index e157e36cbf..386e933a72 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwActivation(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation); /** Destructor */ ~GpuCkwActivation() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp index 6ecf2bac44..e8e5087633 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp @@ -24,16 +24,18 @@ #include "GpuCkwCast.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -84,30 
+86,29 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ } } // namespace -GpuCkwCast::GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } +GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Load the source tile and prepare the sampler. - if(!src->has_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -122,7 +123,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { // Get Target datatype and convert it to ckw::DataType. 
ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type()); @@ -143,7 +144,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const size_t dst_size = data_size_from_type(_dst->data_type()); const bool cast_down = (src_size >= dst_size); - if(cast_down && is_data_type_quantized(_src->data_type())) + if (cast_down && is_data_type_quantized(_src->data_type())) { const auto &constant_x80 = writer->declare_tile("0x80", 0x80); writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80); @@ -151,7 +152,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None; - if(cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { convert_policy = ckw::ConvertPolicy::Saturate; } @@ -167,9 +168,10 @@ Window GpuCkwCast::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h index 821cec1e19..2389301196 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast); /** Destructor */ ~GpuCkwCast() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp index 3c906646a6..7833da2334 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp @@ -25,21 +25,20 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h" #include "arm_compute/core/Error.h" -#include 
"arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" - +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/TileInfo.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" namespace arm_compute { @@ -54,13 +53,7 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _wei{}, - _bia{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,7 +62,9 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null } -void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto desc = _settings.direct_conv_descriptor(); ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image, @@ -99,15 +94,18 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // extra loop to compute the left-over elements. const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); GpuCkwComponentArgument *wei = vtable.declare_variable( - comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + comp_group, writer, _wei, + use_cl_image_for_weights ? 
TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); GpuCkwComponentArgument *bia = nullptr; const bool using_bias = _bia != nullptr; - if(using_bias) + if (using_bias) { bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia"); } @@ -154,7 +152,8 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, src_sampler.address_mode_x(TensorSamplerAddressModeX::None); // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the // indirection buffer mi does not contain negative values representing out-of-bounds reads. - src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly); + src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None + : TensorSamplerAddressModeY::SkipMinEdgeOnly); src_sampler.address_mode_z(TensorSamplerAddressModeZ::None); TensorTileSampler wei_sampler; @@ -178,7 +177,7 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, dst_sampler.z(tile_0); dst_sampler.b(tile_bout); - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0)); dst->init_virtual_tensor(tile, dst_sampler); @@ -189,10 +188,10 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // We create a 2d container of size (M0, 1) to store the indices for iteration TileContainer it; - for(int m = 0; m < m0; ++m) + for (int m = 0; m < m0; ++m) { - std::vector idx { std::to_string(m) }; - it.push_back({ idx }); + std::vector idx{std::to_string(m)}; + it.push_back({idx}); } const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32); @@ -289,9 +288,9 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // Bias addition // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. 
- if(using_bias) + if (using_bias) { - if(!bia->has_tile()) + if (!bia->has_tile()) { // Reuse the destination sampler for the bias writer->op_load_once(bia, dst_sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp index c8bf999261..2935ba45ea 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp @@ -24,22 +24,24 @@ #include "GpuCkwElementwiseBinary.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/types/TensorSamplerTypes.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" + #include #include @@ -53,11 +55,7 @@ namespace dynamic_fusion GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -65,15 +63,20 @@ GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); } -void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const auto n0 = static_cast(root_window.x().step()); const auto m0 = static_cast(root_window.y().step()); - GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); - GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, 
TensorStorageType::ClBufferUint8Ptr, "rhs"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *lhs = + vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); + GpuCkwComponentArgument *rhs = + vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -86,32 +89,36 @@ void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_gr auto &const_0 = writer->declare_tile("0", 0); // Load the LHS and RHS tiles - if(!lhs->has_tile()) + if (!lhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), n0, m0, "lhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), + n0, m0, "lhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(lhs, sampler); } - if(!rhs->has_tile()) + if (!rhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), n0, m0, "rhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), + n0, m0, "rhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(rhs, sampler); } - auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), n0, m0, "dst_", const_0); + auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), + n0, m0, "dst_", const_0); dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension dst_sampler.z(const_0); dst_sampler.b(gid_2); // Prepare the output tile. 
- if(!dst->has_tile()) + if (!dst->has_tile()) { - auto &tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); + auto &tile = writer->declare_tile( + "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); dst->init_virtual_tensor(tile, dst_sampler); } @@ -131,9 +138,10 @@ Window GpuCkwElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } @@ -141,11 +149,12 @@ Window GpuCkwElementwiseBinary::get_window() const std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - const std::vector build_params = - { + const std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), }; return join(build_params, "_"); } @@ -154,13 +163,16 @@ std::string GpuCkwElementwiseBinary::get_tuner_id(const ComponentGroup &comp_gro { ARM_COMPUTE_UNUSED(comp_group); /// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles) - std::vector build_params = - { + std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), - "dst_dim0", support::cpp11::to_string(_dst->dimension(0)), - "dst_dim1", support::cpp11::to_string(_dst->dimension(1)), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), + "dst_dim0", + support::cpp11::to_string(_dst->dimension(0)), + "dst_dim1", + support::cpp11::to_string(_dst->dimension(1)), }; return join(build_params, "_"); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h index e9c41530f8..1a20d4c533 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h @@ -46,17 +46,17 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary); /** Destructor */ ~GpuCkwElementwiseBinary() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, 
GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; - std::string get_name(const ComponentGroup &comp_group) const override; - std::string get_tuner_id(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + std::string get_tuner_id(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_lhs; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp index 9c9a298132..8ab3ec3a55 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp @@ -24,17 +24,18 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" using namespace ckw; @@ -48,11 +49,7 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); @@ -60,14 +57,18 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, 
_src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -90,23 +91,26 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const auto src_data_type = _src->data_type(); // Check if this is global pooling path - const bool is_global_pooling = (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); + const bool is_global_pooling = + (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); // Check if this a case of FP_MIXED_PRECISION - const bool use_fp_mixed_precision = (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); + const bool use_fp_mixed_precision = + (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; + const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); TileOperand &const_0 = writer->declare_tile("0", 0); const TileOperand &const_1 = writer->declare_tile("1", 1); const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits::lowest()); const TileOperand &pool_size_x_tile = writer->declare_tile("POOL_SIZE_X", pool_size_x); const TileOperand &pool_size_y_tile = writer->declare_tile("POOL_SIZE_Y", pool_size_y); - const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); - const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); - const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); - const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); - const TileOperand &dst_height_tile = writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); - const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); - const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); + const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); + const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); + const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); + const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); + const TileOperand &dst_height_tile = + writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); + const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); + const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32); TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32); @@ -145,7 +149,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw // Prepare dst tensor and tile TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0); - if(!dst->has_tile()) + if (!dst->has_tile()) { TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info); dst->init_virtual_tensor(dst_tile, dst_sampler); @@ -156,14 +160,15 @@ 
void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); // Initialise result tile with appropriate value - if(_attributes.pool_type() == PoolingType::MAX) + if (_attributes.pool_type() == PoolingType::MAX) { - if(_settings.use_inf_as_limit()) + if (_settings.use_inf_as_limit()) { TileContainer minus_inf_tile_container; std::vector value = std::vector(n0, "(-INFINITY)"); - minus_inf_tile_container.push_back({ value }); - const TileOperand &minus_inf = writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); + minus_inf_tile_container.push_back({value}); + const TileOperand &minus_inf = + writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); writer->op_assign(res_tile, minus_inf); } else @@ -209,7 +214,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e); const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32); - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32); const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32); @@ -227,7 +232,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32); const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32); - if(is_global_pooling) + if (is_global_pooling) { writer->op_assign(x, const_0); writer->op_assign(y, const_0); @@ -242,76 +247,80 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw } // Y dim for-loop - writer->op_for_loop(y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, [&]() - { - // Reset the iterator for the inner loop - if(is_global_pooling) - { - writer->op_assign(x, const_0); - } - else + writer->op_for_loop( + y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, + [&]() { - writer->op_assign(x, pool_x_s); - } - - TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); - writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); - - // X dim for-loop - writer->op_for_loop(x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, [&]() - { - TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); - writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); - - TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); - - src_sampler.y(a_x); - src_sampler.z(a_y); - - // Load src tile - if(use_fp_mixed_precision) + // Reset the iterator for the inner loop + if (is_global_pooling) { - TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); - writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); - writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + writer->op_assign(x, const_0); } else { - writer->op_load(src_tile, src->tensor(), src_sampler); + writer->op_assign(x, pool_x_s); } - // Take the square of the input, for L2 Pooling - if(_attributes.pool_type() == PoolingType::L2) - { - writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); - } - - // Perfom Pooling op - if(_attributes.pool_type() == 
PoolingType::MAX) - { - writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); - } - else - { - writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); - } + TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); + writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); + + // X dim for-loop + writer->op_for_loop( + x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, + [&]() + { + TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); + writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); + + TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); + + src_sampler.y(a_x); + src_sampler.z(a_y); + + // Load src tile + if (use_fp_mixed_precision) + { + TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); + writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); + writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + } + else + { + writer->op_load(src_tile, src->tensor(), src_sampler); + } + + // Take the square of the input, for L2 Pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); + } + + // Perfom Pooling op + if (_attributes.pool_type() == PoolingType::MAX) + { + writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); + } + else + { + writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); + } + }); }); - }); - if((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) + if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) { // filter_size is automatically broadcasted in the operation writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size); } // Take square root of the result in L2 pooling - if(_attributes.pool_type() == PoolingType::L2) + if (_attributes.pool_type() == PoolingType::L2) { writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile); } // Store the results and do casting if FP_MIXED_PRECISION - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None); } @@ -326,7 +335,7 @@ Window GpuCkwPool2d::get_window() const ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); TensorShape output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size. 
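The get_window() bodies reformatted above (GpuCkwActivation, GpuCkwCast, GpuCkwElementwiseBinary) all derive their processing step the same way: collapse W and H, divide a 16-byte OpenCL vector budget by the destination element size, and clamp the result with adjust_vec_size() before calling calculate_max_window(). The stand-alone sketch below only approximates that step calculation: it assumes adjust_vec_size() picks the largest vector width that does not exceed the innermost dimension (the real helper is declared in arm_compute/core/utils/helpers/AdjustVecSize.h; adjust_vec_size_sketch and the example sizes are illustrative, not the library's implementation).

```cpp
// Minimal sketch of the step calculation used by the get_window() implementations above.
// adjust_vec_size_sketch approximates arm_compute's adjust_vec_size(): it assumes the helper
// halves the requested vector width until it fits the innermost dimension (never below 1).
#include <cstddef>
#include <iostream>

unsigned int adjust_vec_size_sketch(unsigned int vec_size, std::size_t dim0)
{
    while (vec_size > 1 && vec_size > dim0)
    {
        vec_size /= 2;
    }
    return vec_size;
}

int main()
{
    constexpr unsigned int vector_size_byte_opencl = 16; // same byte budget as the components above
    const std::size_t element_size = 2;                  // e.g. an FP16 destination tensor
    const std::size_t dst_dim0     = 11;                 // innermost (channel) dimension

    const unsigned int num_elems_processed_per_iteration =
        adjust_vec_size_sketch(vector_size_byte_opencl / element_size, dst_dim0);
    std::cout << "elements per iteration: " << num_elems_processed_per_iteration << '\n'; // prints 8
    return 0;
}
```

GpuCkwPool2d::get_window() in the hunk above is the exception: it feeds a fixed width of 2 (F32) or 4 elements into adjust_vec_size() instead of the byte-budget calculation.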
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h index 2ccf255236..822282a108 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h @@ -59,9 +59,11 @@ public: /** Destructor */ ~GpuCkwPool2d() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; - std::string get_name(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp index d997c82dae..f2a7d41afd 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp @@ -28,14 +28,13 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" - +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -49,20 +48,17 @@ namespace constexpr unsigned int opencl_vector_size_in_bytes = 16; } // namespace -GpuCkwResize::GpuCkwResize(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } +GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); @@ -72,12 +68,16 @@ void 
GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, const int32_t m0 = root_window.y().step(); const int32_t partial_n0 = _dst->dimension(0) % n0; - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Constants - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); const auto &tile_0 = writer->declare_tile("0", 0); @@ -112,7 +112,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); - switch(_attributes.sampling_policy()) + switch (_attributes.sampling_policy()) { case SamplingPolicy::TOP_LEFT: // xi_f = (xo * scale_x) @@ -138,7 +138,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, ARM_COMPUTE_ERROR("Unsupported sampling policy"); } - if(_attributes.align_corners()) + if (_attributes.align_corners()) { writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f); writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f); @@ -161,8 +161,10 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); - writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, tile_src_w_minus_1); - writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, tile_src_h_minus_1); + writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, + tile_src_h_minus_1); TensorTileSampler src_sampler; src_sampler.x(tile_co); @@ -199,7 +201,9 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, writer->op_assign(tile_dst, tile_src); } -void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), 
DataLayoutDimension::WIDTH); const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); @@ -209,12 +213,16 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa const int32_t m0 = root_window.y().step(); const int32_t partial_n0 = _dst->dimension(0) % n0; - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Constants - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); const auto &tile_0 = writer->declare_tile("0", 0); @@ -251,7 +259,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); - switch(_attributes.sampling_policy()) + switch (_attributes.sampling_policy()) { case SamplingPolicy::TOP_LEFT: // xi_f = (xo * scale_x) @@ -312,8 +320,10 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1); writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1); - writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, tile_src_w_minus_1); - writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, tile_src_h_minus_1); + writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, + tile_src_h_minus_1); TensorTileSampler in_sampler; in_sampler.x(tile_co); @@ -388,7 +398,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float); writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1); - if(is_data_type_float(_src->data_type())) + if (is_data_type_float(_src->data_type())) { // Cast weights to source type const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type())); @@ -461,9 +471,11 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa } } -void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, 
GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { - switch(_attributes.interpolation_policy()) + switch (_attributes.interpolation_policy()) { case InterpolationPolicy::NEAREST_NEIGHBOR: do_nearest_neighbor_resize(comp_group, vtable, writer); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp index 8917391537..889706b0c0 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp @@ -24,10 +24,12 @@ #include "GpuCkwStore.h" #include "arm_compute/core/Error.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include namespace arm_compute @@ -37,12 +39,14 @@ namespace experimental namespace dynamic_fusion { GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack &tensors) - : IGpuCkwComponentDriver{ id, tensors }, _src{}, _dst{} + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); } -void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h index 8e35651caf..f1f0e6747b 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h @@ -48,8 +48,10 @@ public: /** Destructor */ ~GpuCkwStore() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - std::string get_name(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + std::string get_name(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h index e2b8584b99..6ba2b2f651 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/Utility.h" #include "ckw/TensorTileSampler.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" 
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" @@ -44,9 +45,14 @@ using SamplerCreator = std::functionhas_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -61,7 +67,7 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info()); dst->init_virtual_tensor(tile, sampler); @@ -78,7 +84,13 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri * @param[in] prefix Prefix to all the tiles declared within this function * @param[in] const_0 Constant tile of value 0 */ -inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const TileOperand &gid, int32_t step_v, int32_t leftover_step_v, const std::string &prefix, const TileOperand &const_0) +inline void get_coord(GpuCkwScopedKernelWriter writer, + TileOperand &coord, + const TileOperand &gid, + int32_t step_v, + int32_t leftover_step_v, + const std::string &prefix, + const TileOperand &const_0) { auto &step = writer->declare_tile(prefix + "step", step_v); auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v); @@ -122,8 +134,15 @@ inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const * * @return TensorTileSampler */ -inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, TileOperand &gid_0, TileOperand &gid_1, int32_t dim0_v, int32_t dim1_v, int32_t n0_v, int32_t m0_v, - const std::string prefix, TileOperand &const_0) +inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, + TileOperand &gid_0, + TileOperand &gid_1, + int32_t dim0_v, + int32_t dim1_v, + int32_t n0_v, + int32_t m0_v, + const std::string prefix, + TileOperand &const_0) { // Clamp tile size [n0, m0] against dimension [dim0, dim1] // This is needed to: diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h index 34b1283add..5da317bf38 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" #include "ckw/TensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" namespace arm_compute @@ -38,7 +39,7 @@ namespace dynamic_fusion { inline ckw::DataType to_ckw(DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: return ckw::DataType::Fp32; @@ -65,21 +66,16 @@ inline ckw::DataType to_ckw(DataType dt) inline ckw::TensorShape to_ckw(const TensorShape &shape) { - ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size {}); - ARM_COMPUTE_ERROR_ON(std::tuple_size {} != 5); + ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size{}); + ARM_COMPUTE_ERROR_ON(std::tuple_size{} != 5); /// NOTE: Overflow danger. Use size_t? 
- return ckw::TensorShape - { - static_cast(shape[0]), - static_cast(shape[1]), - static_cast(shape[2]), - static_cast(shape[3]), - static_cast(shape[4]) - }; + return ckw::TensorShape{static_cast(shape[0]), static_cast(shape[1]), + static_cast(shape[2]), static_cast(shape[3]), + static_cast(shape[4])}; } inline ckw::TensorDataLayout to_ckw(DataLayout dl) { - switch(dl) + switch (dl) { case DataLayout::NHWC: return ckw::TensorDataLayout::Nhwc; @@ -91,18 +87,13 @@ inline ckw::TensorDataLayout to_ckw(DataLayout dl) } inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info) { - return ckw::TensorInfo - { - to_ckw(tensor_info.data_type()), - to_ckw(tensor_info.tensor_shape()), - to_ckw(tensor_info.data_layout()), - tensor_info.id() - }; + return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()), + to_ckw(tensor_info.data_layout()), tensor_info.id()}; } inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) { - switch(component) + switch (component) { case ckw::TensorComponentType::OffsetFirstElement: return TensorComponentType::OffsetFirstElement; @@ -142,7 +133,7 @@ inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) { - switch(storage) + switch (storage) { case TensorStorageType::ClBufferUint8Ptr: return ckw::TensorStorageType::BufferUint8Ptr; @@ -159,7 +150,7 @@ inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) } inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage) { - switch(storage) + switch (storage) { case ckw::TensorStorageType::BufferUint8Ptr: return TensorStorageType::ClBufferUint8Ptr; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h index 9cb022fc10..0cba258940 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY #include "ckw/types/Operators.h" + #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" namespace arm_compute @@ -35,7 +36,7 @@ namespace dynamic_fusion { inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes) { - switch(attributes.operation()) + switch (attributes.operation()) { case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add: return ckw::BinaryOp::Add; diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h index f7f0029618..ee109a7e2b 100644 --- a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h +++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h @@ -24,8 +24,9 @@ #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY -#include "Types.h" #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include "Types.h" #include namespace arm_compute @@ -49,13 +50,13 @@ public: * @return std::unique_ptr */ template - std::unique_ptr create(Args &&... 
args) + std::unique_ptr create(Args &&...args) { return std::make_unique(_count++, std::forward(args)...); } private: - ComponentId _count{ 0 }; + ComponentId _count{0}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h index af766a7ece..4b8eea2f57 100644 --- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h +++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h @@ -24,11 +24,11 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT -#include "Types.h" - #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" +#include "Types.h" + namespace arm_compute { namespace experimental @@ -76,13 +76,8 @@ public: * @param[in] properties Kernel component properties * @param[in] tensors Tensor arguments to the components */ - IGpuKernelComponent( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors) - : _id{ id }, - _properties{ properties }, - _tensors{ tensors } + IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack &tensors) + : _id{id}, _properties{properties}, _tensors{tensors} { } /** Destructor */ @@ -117,7 +112,7 @@ public: virtual GpuComponentType type() const = 0; private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; Properties _properties{}; ArgumentPack _tensors{}; }; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp index c41257d18c..fdf528a65d 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp @@ -68,17 +68,11 @@ ClComponentActivation::ClComponentActivation(ComponentId const IGpuKernelComponent::Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h index 9b090af988..02c854356a 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h @@ -25,9 +25,8 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" -#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute { @@ -79,20 +78,17 @@ public: * |F16 |F16 | * |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, 
const Attributes &attributes); /** Constructor * * Similar to @ref ClComponentActivation::validate() */ - ClComponentActivation( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + ClComponentActivation(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes); /** Destructor */ ~ClComponentActivation() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp index 635869f817..b1636795a3 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp @@ -24,6 +24,7 @@ #include "ClComponentCast.h" #include "arm_compute/core/Error.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF @@ -38,11 +39,10 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentCast::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentCast::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties, attributes, settings); @@ -53,13 +53,15 @@ Status ClComponentCast::validate( ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), "input and target data types should be different"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), + "input and target data types should be different"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), "dst and target data types should be same"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), + "dst and target data types should be same"); } return Status{}; @@ -69,17 +71,11 @@ ClComponentCast::ClComponentCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { ARM_COMPUTE_UNUSED(attributes, settings); diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h index 37b8cbb6c9..ed77b1203b 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -93,11 +94,10 @@ public: * |F16 
| U8, S8, U16, S16, U32, S32, F32 | * |F32 | U8, S8, U16, S16, U32, S32, F16 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp index 5626093079..d95e0be1f2 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h" @@ -103,11 +104,10 @@ unsigned int Settings::m0() const return _m0; } -Status ClComponentDepthwiseConv2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentDepthwiseConv2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties, settings); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -121,7 +121,7 @@ Status ClComponentDepthwiseConv2d::validate( // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); } @@ -129,7 +129,7 @@ Status ClComponentDepthwiseConv2d::validate( // Matching data layout ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); } @@ -138,7 +138,7 @@ Status ClComponentDepthwiseConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); } @@ -148,16 +148,17 @@ Status ClComponentDepthwiseConv2d::validate( const DataLayout data_layout = src->data_layout(); const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * attributes.depth_multiplier())); + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * attributes.depth_multiplier())); ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional"); // dst shape is correct - const PadStrideInfo pad_stride_info = PadStrideInfo(attributes.stride().x(), attributes.stride().y(), - attributes.pad().left, attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, - attributes.dimension_rounding_type()); - const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), 
ActivationLayerInfo(), attributes.dilation() }; - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + const PadStrideInfo pad_stride_info = + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type()); + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); @@ -168,19 +169,22 @@ Status ClComponentDepthwiseConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1); - if(conv_info.depth_multiplier > 1 && settings.n0() > 1) + if (conv_info.depth_multiplier > 1 && settings.n0() > 1) { ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0); } // Check export weights to cl image - ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && (export_to_cl_image(wei) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && + (export_to_cl_image(wei) == false), + "Weights cannot be exported to cl_image!"); ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0)); - ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * conv_info.depth_multiplier)); + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * conv_info.depth_multiplier)); // bia shape is correct - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx], "Biases size and number of dst feature maps should match"); @@ -198,14 +202,13 @@ Status ClComponentDepthwiseConv2d::validate( return Status{}; } -ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes, settings) } +ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes, settings)} { } ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d() diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h index 0e2b5f14cb..b3e1bd222d 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h @@ -25,7 +25,9 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D #include "arm_compute/core/Error.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -77,12 +79,12 @@ public: unsigned int 
m0() const; private: - bool _export_input_to_cl_image{ false }; /**< Export input to cl_image */ - bool _export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */ - bool _fast_relaxed_math{ true }; /**< Enable/disable -cl-fast-relaxed-math flag */ - bool _is_fma_available{ false }; /**< Is fma instruction available */ - unsigned int _n0{ 0 }; /**< Number of columns processed by each thread */ - unsigned int _m0{ 0 }; /**< Number of rows processed by each thread */ + bool _export_input_to_cl_image{false}; /**< Export input to cl_image */ + bool _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */ + bool _fast_relaxed_math{true}; /**< Enable/disable -cl-fast-relaxed-math flag */ + bool _is_fma_available{false}; /**< Is fma instruction available */ + unsigned int _n0{0}; /**< Number of columns processed by each thread */ + unsigned int _m0{0}; /**< Number of rows processed by each thread */ }; /** Forward declaration */ @@ -127,22 +129,20 @@ public: * |F16 |F16 |F16 |F16 | * |F32 |F32 |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * * Similar to @ref ClComponentDepthwiseConv2d::validate() */ - ClComponentDepthwiseConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentDepthwiseConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentDepthwiseConv2d() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp index a713c82003..98f3d6a882 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -23,8 +23,8 @@ */ #include "ClComponentDirectConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" #include "src/core/CL/CLValidate.h" @@ -57,7 +57,8 @@ bool ClComponentDirectConv2dSettings::fast_relaxed_math() const return _fast_relaxed_math; } -ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) +ClComponentDirectConv2dSettings & +ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) { _desc = desc; return *this; @@ -68,11 +69,10 @@ DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descrip return _desc; } -Status ClComponentDirectConv2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentDirectConv2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -86,7 +86,7 @@ Status ClComponentDirectConv2d::validate( // Matching data type 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); } @@ -94,7 +94,7 @@ Status ClComponentDirectConv2d::validate( // Matching data layout ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); } @@ -103,7 +103,7 @@ Status ClComponentDirectConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); } @@ -112,22 +112,23 @@ Status ClComponentDirectConv2d::validate( // wei shape is correct const DataLayout data_layout = src->data_layout(); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional"); // dst shape is correct - PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top, - attributes.pad().bottom, DimensionRoundingType{}); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); + PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType{}); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); // bia shape is correct - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional"); } // 2. 
Check support level @@ -137,24 +138,25 @@ Status ClComponentDirectConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); const auto desc = settings.direct_conv_descriptor(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); return Status{}; } -ClComponentDirectConv2d::ClComponentDirectConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes, settings) } -#else // ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes, settings) } + _component_writer{std::make_unique(id, tensors, attributes, settings)} +#else // ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes, settings)} #endif // ACL_INTERNAL_TEST_CKW_IN_DF { } @@ -165,7 +167,7 @@ ClComponentDirectConv2d::~ClComponentDirectConv2d() #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const #endif // ACL_INTERNAL_TEST_CKW_IN_DF { diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h index 24acb1b2c1..d6d9705d3c 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h @@ -26,7 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -61,7 +63,7 @@ public: DirectConvComputeKernelInfo direct_conv_descriptor() const; private: - bool _fast_relaxed_math{ true }; + bool _fast_relaxed_math{true}; DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor }; @@ -111,22 +113,20 @@ public: * |F16 |F16 |F16 |F16 | * |F32 |F32 |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * * Similar to @ref ClComponentDirectConv2d::validate() */ - ClComponentDirectConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const 
Settings &settings); + ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentDirectConv2d() override; @@ -142,7 +142,7 @@ public: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *template_writer() const override; #else // ACL_INTERNAL_TEST_CKW_IN_DF - const IGpuCkwComponentDriver *ckw_component_driver() const override; + const IGpuCkwComponentDriver *ckw_component_driver() const override; #endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Get component type */ GpuComponentType type() const override diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp index 88d729170c..5b136427e4 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp @@ -24,6 +24,7 @@ #include "ClComponentElementwiseBinary.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h" @@ -39,56 +40,55 @@ namespace dynamic_fusion { namespace { -std::set supported_ops -{ - ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, - ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub, - ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul -}; +std::set supported_ops{ + ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub, + ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul}; } -Status ClComponentElementwiseBinary::validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes) +Status ClComponentElementwiseBinary::validate(const ArgumentPack &tensors, + const ElementwiseBinaryCommonAttributes &attributes) { const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); // Check operator type - ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), "Provided Elementwise operation not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), + "Provided Elementwise operation not supported."); // Check validity ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); //Check data type for different elementwise operators - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, DataType::S16, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, + DataType::S16, DataType::U8); // dst shape is correct const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst."); const auto &lhs_shape = lhs->tensor_shape(); const auto &rhs_shape = rhs->tensor_shape(); const auto &dst_shape = 
dst->tensor_shape(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(lhs_shape, dst_shape, 0) && detail::have_different_dimensions(rhs_shape, dst_shape, 0), - "Only LHS or RHS can be broadcasting, not both."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) && + detail::have_different_dimensions(rhs_shape, dst_shape, 0), + "Only LHS or RHS can be broadcasting, not both."); // Dimension Y and Z are collapsed together in the current kernel implementation, // hence they cannot be independently broadcast or non-broadcast. // See: ClTemplateElementwiseBinary::get_window - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), - "Dimension Y and Z must both be either broadcast or non-broadcast."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != + (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), + "Dimension Y and Z must both be either broadcast or non-broadcast."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(lhs_shape, dst_shape, 3), - "LHS broadcast in dimension 3 or higher is not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3), + "LHS broadcast in dimension 3 or higher is not supported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(rhs_shape, dst_shape, 3), - "RHS broadcast in dimension 3 or higher is not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3), + "RHS broadcast in dimension 3 or higher is not supported."); // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); @@ -112,22 +112,15 @@ Status ClComponentElementwiseBinary::validate(const ArgumentPack &t ClComponentElementwiseBinary::~ClComponentElementwiseBinary() { } -ClComponentElementwiseBinary::ClComponentElementwiseBinary( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h index f7175903d0..7589b9732c 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h @@ -82,17 +82,17 @@ public: * |S16 |S16 |S16 | * |U8 |U8 |U8 | */ - static Status validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes); + static Status validate(const ArgumentPack &tensors, + const ElementwiseBinaryCommonAttributes &attributes); /** Constructor * * Similar to @ref ClComponentElementwiseBinary::validate() */ - 
ClComponentElementwiseBinary( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes); /** Destructor */ ~ClComponentElementwiseBinary() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp index 279c77e227..27c13bd654 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp @@ -25,9 +25,10 @@ #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h" @@ -37,10 +38,9 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentLogits1DMaxShiftExpSum::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) +Status ClComponentLogits1DMaxShiftExpSum::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) { ARM_COMPUTE_UNUSED(properties, attributes); @@ -75,8 +75,8 @@ ClComponentLogits1DMaxShiftExpSum::ClComponentLogits1DMaxShiftExpSum(ComponentId const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes) } + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes)} { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h index b5db458248..91ab5de3b5 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -89,10 +90,8 @@ public: * |F16 | F16 | F16 | * |F32 | F32 | F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp index 7864d56d29..fb2544385c 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp @@ -25,9 +25,10 @@ #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "arm_compute/core/CL/CLHelpers.h" 
-#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h" @@ -37,10 +38,9 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentLogits1DNorm::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) +Status ClComponentLogits1DNorm::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) { ARM_COMPUTE_UNUSED(properties, attributes); @@ -77,8 +77,8 @@ ClComponentLogits1DNorm::ClComponentLogits1DNorm(ComponentId const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes) } + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes)} { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h index 5bd350b9bd..74c0273604 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -86,10 +87,8 @@ public: * |F16 | F16 | F16 | * |F32 | F32 | F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp index d415769094..409b191df5 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp @@ -24,13 +24,15 @@ #include "ClComponentPool2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h" #include "src/dynamic_fusion/utils/Utils.h" + #include namespace arm_compute @@ -39,23 +41,24 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentPool2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentPool2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), "Unsupported Pooling type"); + ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), + "Unsupported Pooling type"); // 1. Check validity // Check if pooling is valid - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())), - "Pooling region that is entirely outside input tensor is unsupported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())), + "Pooling region that is entirely outside input tensor is unsupported"); // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -70,8 +73,9 @@ Status ClComponentPool2d::validate( // Device requirements are met ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()))); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_pool_shape( + *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()))); // 2. Check support level // Data type @@ -83,23 +87,16 @@ Status ClComponentPool2d::validate( return Status{}; } -ClComponentPool2d::ClComponentPool2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentPool2d::ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes, settings) -} + _component_writer{std::make_unique(id, tensors, attributes, settings)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes, settings) -} + _component_writer{std::make_unique(id, tensors, attributes, settings)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h index 6814bf9243..98fed65004 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -82,11 +83,10 @@ public: * |F16 |F16 | * |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * @@ -96,12 +96,11 @@ public: * @param[in] attributes Component attributes * @param[in] settings Component settings */ - ClComponentPool2d( - 
ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentPool2d() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp index 66e2ee6956..0ece9de970 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "ClComponentReshape.h" + #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h" @@ -49,12 +51,10 @@ Status ClComponentReshape::validate(const ArgumentPack &tensors) return Status{}; } -ClComponentReshape::ClComponentReshape( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors) } +ClComponentReshape::ClComponentReshape(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors) + : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique(id, tensors)} { } ClComponentReshape::~ClComponentReshape() diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h index f8d165b4c8..78163d6603 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h @@ -73,10 +73,7 @@ public: * @param[in] properties Component properties @ref Properties * @param[in] tensors Tensor arguments to the component */ - ClComponentReshape( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors); + ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack &tensors); /** Destructor */ ~ClComponentReshape() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp index 6df1d9b3db..b05eb04698 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp @@ -66,7 +66,9 @@ Status ClComponentResize::validate(const IGpuKernelComponent::Properties &proper ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); // Align corners and sampling policy conformance - ARM_COMPUTE_RETURN_ERROR_ON(attributes.align_corners() && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy())); + ARM_COMPUTE_RETURN_ERROR_ON( + attributes.align_corners() && + !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy())); // All tensor infos are initialized ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); @@ -79,11 +81,11 @@ ClComponentResize::ClComponentResize(ComponentId id, const IGpuKernelComponent::Properties &properties, const ArgumentPack &tensors, const ClComponentResize::Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - 
_component_writer{ std::make_unique(id, tensors, attributes) } -#else // ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes) } + _component_writer{std::make_unique(id, tensors, attributes)} +#else // ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes)} #endif // ACL_INTERNAL_TEST_CKW_IN_DF { } @@ -94,7 +96,7 @@ ClComponentResize::~ClComponentResize() #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentResize::template_writer() const -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const #endif // ACL_INTERNAL_TEST_CKW_IN_DF { diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h index 474524f8fc..29276c3257 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h @@ -26,6 +26,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -43,7 +44,7 @@ class ArgumentPack; /** Forward declaration */ #ifndef ACL_INTERNAL_TEST_CKW_IN_DF class ClTemplateResize; -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF class GpuCkwResize; #endif // ACL_INTERNAL_TEST_CKW_IN_DF @@ -82,10 +83,8 @@ public: * |U8 |U8 | * |S16 |S16 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * @@ -114,7 +113,7 @@ public: /** Get writer for the component */ #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *template_writer() const override; -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF const IGpuCkwComponentDriver *ckw_component_driver() const override; #endif // ACL_INTERNAL_TEST_CKW_IN_DF @@ -127,7 +126,7 @@ public: private: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF std::unique_ptr _component_writer; -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF std::unique_ptr _component_writer; #endif // ACL_INTERNAL_TEST_CKW_IN_DF }; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp index 12b81c3d56..dcbecaff35 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp @@ -38,25 +38,19 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentStore::validate( - const Properties &properties, - const ArgumentPack &tensors) +Status ClComponentStore::validate(const Properties &properties, const ArgumentPack &tensors) { ARM_COMPUTE_UNUSED(properties, tensors); return Status{}; } -ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack &tensors) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentStore::ClComponentStore(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - 
std::make_unique(id, tensors) -} + _component_writer{std::make_unique(id, tensors)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors) -} + _component_writer{std::make_unique(id, tensors)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h index 853ee39012..948785c480 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -70,9 +71,7 @@ public: * |:--------------|:--------------| * |All |All | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors); + static Status validate(const Properties &properties, const ArgumentPack &tensors); /** Constructor * * Similar to @ref ClComponentStore::validate() diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h index bc7133f4df..4c3e84e59d 100644 --- a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h @@ -46,18 +46,16 @@ using namespace experimental::dynamic_fusion; */ inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op) { - const std::map op_name = - { - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub" } - }; + const std::map op_name = { + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}}; os << op_name.at(op); return os; } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp index e7ee1c10df..2cec67dc65 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,12 +33,11 @@ namespace experimental { namespace dynamic_fusion { -Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Add then call the elementwise common validate_op @@ -46,12 +46,11 @@ Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Add then call the elementwise common is_supported_op @@ -60,9 +59,7 @@ Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op() // Set the elementwise operation to Add then call the elementwise common create_op diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp index 33c2d43e07..6f35e66ea8 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp @@ -23,12 +23,11 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h" - -#include "src/common/utils/Log.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -49,7 +48,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { 
dst_info_to_validate_ptr = dst; } @@ -58,25 +57,22 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check support level // Data Type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - - if(context.gpu_language() == GpuLanguage::OpenCL) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::S16, DataType::U16, DataType::U32, + DataType::S32, DataType::F16, DataType::F32); + + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Cast Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentCast::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -94,16 +90,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuCast::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const CastAttributes &attributes) +Status +GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const CastAttributes &attributes) +Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -127,9 +120,7 @@ Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const CastAttributes &attributes) +ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -145,14 +136,15 @@ ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); const auto *sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); // Add Depthwise Conv2d Component { - 
const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentCast::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp index 89b533c9b8..697b7d4e1f 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp @@ -25,14 +25,13 @@ #include "arm_compute/core/experimental/Types.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace experimental @@ -48,12 +47,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), "Maximum clamp value cannot be lower than minimum value"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), + "Maximum clamp value cannot be lower than minimum value"); TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -61,16 +61,15 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, *src->clone()); // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped - const ClComponentActivation::Attributes act_info - { - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val() - }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -87,16 +86,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuClamp::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ClampAttributes &attributes) +Status +GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const 
ClampAttributes &attributes) +Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -121,9 +117,7 @@ Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const ClampAttributes &attributes) +ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -139,18 +133,16 @@ ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped - const ClComponentActivation::Attributes act_info - { - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val() - }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp index cb270ed4b0..aaeec543f8 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -24,15 +24,15 @@ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" @@ -45,24 +45,30 @@ namespace dynamic_fusion { namespace { -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr t = arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); + std::unique_ptr t = + arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); return t->configure(src, weights, 
conv_info); } -void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const Conv2dAttributes &attributes) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const Conv2dAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wei->tensor_shape(), - PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + const auto shape = misc::shape_calculator::compute_deep_convolution_shape( + src->tensor_shape(), src->data_layout(), wei->tensor_shape(), + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } @@ -83,7 +89,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -98,18 +104,20 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check components const auto gpu_target = context.gpu_target(); - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Direct Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDirectConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDirectConv2d::Settings(); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst_info_to_validate_ptr->data_type() == DataType::F32 || + dst_info_to_validate_ptr->data_type() == DataType::F16)); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -142,14 +150,14 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *wei, const ITensorInfo *bia, - const Conv2dAttributes &attributes) + const Conv2dAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported"); // Check if tensors have valid id. I.e. 
they are created from a sketch ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } @@ -178,16 +186,13 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes); } -ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *wei, - ITensorInfo *bia, - const Conv2dAttributes &attributes) +ITensorInfo *GpuConv2d::create_op( + GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes) { ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes); PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR); // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info); @@ -207,7 +212,7 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, const auto gpu_target = sketch_ctx->gpu_target(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); @@ -216,17 +221,17 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, // Add Direct Conv2d Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); auto settings = ClComponentDirectConv2d::Settings(); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); settings.direct_conv_descriptor(desc); - if(settings.export_to_cl_image()) + if (settings.export_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp index c72098e943..e2b673bd43 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -28,8 +28,8 @@ #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" @@ -42,20 +42,20 @@ namespace dynamic_fusion { namespace { -void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const 
DepthwiseConv2dAttributes &attributes) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const DepthwiseConv2dAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const PadStrideInfo pad_stride_info(attributes.stride().x(), - attributes.stride().y(), - attributes.pad().left, - attributes.pad().right, - attributes.pad().top, - attributes.pad().bottom, + const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type()); - const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; - const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } @@ -76,7 +76,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -91,40 +91,44 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, const GpuTarget gpu_target = context.gpu_target(); - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const CLCompileContext *cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDepthwiseConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDepthwiseConv2d::Settings(); - const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType::FLOOR); // Get the depthwise convolution compute parameters - auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = + t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + 
(dst_info_to_validate_ptr->data_type() == DataType::F32 || + dst_info_to_validate_ptr->data_type() == DataType::F16)); settings.is_fma_available(get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - .m0(dwc_info.m0) - .n0(dwc_info.n0) - .export_input_to_cl_image(dwc_info.export_input_to_cl_image) - .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); arguments.add_const_tensor(ACL_SRC_1, wei); arguments.add_const_tensor(ACL_SRC_2, bia); arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); - ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings)); } } else @@ -158,7 +162,7 @@ Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch &sketch, ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } @@ -205,35 +209,37 @@ ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sket const auto *sketch_ctx = sketch.implementation().context(); const GpuTarget gpu_target = sketch_ctx->gpu_target(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDepthwiseConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDepthwiseConv2d::Settings(); - const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType::FLOOR); // Get the depthwise convolution compute parameters - auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = + t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) - .m0(dwc_info.m0) - .n0(dwc_info.n0) - .export_input_to_cl_image(dwc_info.export_input_to_cl_image) - .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); - if(settings.export_input_to_cl_image()) + if (settings.export_input_to_cl_image()) { 
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src); } - if(settings.export_weights_to_cl_image()) + if (settings.export_weights_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp index 464a32cbad..b871171e8d 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,9 +33,7 @@ namespace experimental { namespace dynamic_fusion { -Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); @@ -46,9 +45,7 @@ Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuMul::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); @@ -60,9 +57,7 @@ Status GpuMul::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // Set the elementwise operation to Mul then call the elementwise common create_op ElementwiseBinaryCommonAttributes common_attributes{}; diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp index 107a5e5fa7..f0d368d757 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp @@ -26,10 +26,9 @@ #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" - #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/utils/Utils.h" namespace arm_compute @@ -43,9 +42,7 @@ namespace constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -60,9 +57,7 @@ Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, return Status{}; } -Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const 
ITensorInfo *dst) +Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -90,9 +85,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, return status; } -void GpuOutput::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst) +void GpuOutput::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst) { ARM_COMPUTE_LOG_PARAMS(src, dst); ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst)); @@ -104,14 +97,14 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch, auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); // Add store component { IGpuKernelComponent::Properties properties; - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp index 7ecfa0158b..55c604aacc 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp @@ -22,20 +22,21 @@ * SOFTWARE. */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" - #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" -#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/utils/Utils.h" namespace arm_compute @@ -46,11 +47,15 @@ namespace dynamic_fusion { namespace { -void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const Pool2dAttributes &attributes, const GpuPool2dSettings &settings) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); + auto shape = misc::shape_calculator::compute_pool_shape( + *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } } @@ -82,7 +87,7 @@ bool GpuPool2dSettings::use_inf_as_limit() const Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, - const Pool2dAttributes 
&attributes, + const Pool2dAttributes &attributes, const GpuPool2dSettings &settings) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -110,7 +115,7 @@ Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch, Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Pool2dAttributes &attributes, - const GpuPool2dSettings &settings) + const GpuPool2dSettings &settings) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); // Data type @@ -118,7 +123,8 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, // Data layout ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); // Check exclude padding is not false - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), "Exclude padding must be set to true in Attributes!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), + "Exclude padding must be set to true in Attributes!"); // Auto initialize dst tensor info TensorInfo dst_info_to_validate; @@ -126,14 +132,15 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Component { - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -148,10 +155,10 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, return Status{}; } -ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Pool2dAttributes &attributes, - const GpuPool2dSettings &settings) +ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) { // Assert validation ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings)); @@ -168,7 +175,7 @@ ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -177,7 +184,7 @@ ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, // Add Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp index 0f43a578df..3def7a1a81 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp @@ -22,12 +22,14 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h" + #include "arm_compute/core/Error.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -40,14 +42,14 @@ namespace Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst, - const ReshapeAttributes &attributes) + const ReshapeAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -55,7 +57,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape())); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); @@ -78,16 +80,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace -Status GpuReshape::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes) +Status +GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const Attributes &attributes) +Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -111,9 +110,7 @@ Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Attributes &attributes) +ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes.shape()); @@ -127,7 +124,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -136,7 +133,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, // Add ElementwiseBinary Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git 
a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp index 5f52eea7d0..fb09875b33 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp @@ -26,12 +26,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h" - -#include "src/common/utils/Log.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -43,7 +43,7 @@ namespace { void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { TensorShape out_shape = src->tensor_shape(); @@ -64,7 +64,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -73,22 +73,25 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check support level // Data type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::U8, DataType::S16, DataType::F16, DataType::F32); // Data layout ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); // Interpolation policy - ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && attributes.interpolation_policy() != InterpolationPolicy::BILINEAR, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && + attributes.interpolation_policy() != InterpolationPolicy::BILINEAR, "Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR"); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Activation Component { - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -107,16 +110,14 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace -Status GpuResize::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes) +Status +GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuResize::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const GpuResize::Attributes &attributes) +Status 
+GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -141,9 +142,7 @@ Status GpuResize::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const GpuResize::Attributes &attributes) +ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -159,13 +158,14 @@ ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); const auto *sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add Resize Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp index 09debad969..a2260c8c36 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h" + #include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -40,9 +41,7 @@ namespace dynamic_fusion { namespace { -Status is_supported_op_helper(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } auto_init_if_empty(dst_info_to_validate, *src->clone()); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + 
const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src) +Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) { return is_supported_op_helper(context, src, nullptr); } -Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src) +Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -112,8 +110,7 @@ Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); } -ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src) +ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src); @@ -128,15 +125,15 @@ ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp index ffc4553a7d..c87b282aec 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp @@ -22,13 +22,14 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h" + #include "arm_compute/core/Error.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" @@ -52,7 +53,7 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; // Auto initialize dst tensor info - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate = *dst; } @@ -61,11 +62,12 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, *src->clone()); } // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); TensorShape logits_sum_shape = src->tensor_shape(); TensorInfo logits(src->clone()->set_tensor_shape(logits_sum_shape)); @@ -86,7 +88,8 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, arguments_norm.add_const_tensor(ACL_SRC_1, &sum); arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate); - ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes)); ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DNorm::validate(properties, arguments_norm, attributes)); } else @@ -105,14 +108,16 @@ Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast(-src->num_dimensions()) || static_cast(src->num_dimensions()) <= attributes.axis()); + ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast(-src->num_dimensions()) || + static_cast(src->num_dimensions()) <= attributes.axis()); // Auto initialize dst tensor info TensorInfo dst_info_to_validate = *dst; auto_init_if_empty(dst_info_to_validate, *src->clone()); - const size_t actual_axis = static_cast(wrap_around(attributes.axis(), static_cast(src->num_dimensions()))); - const bool needs_permute = actual_axis != 0; + const size_t actual_axis = + static_cast(wrap_around(attributes.axis(), static_cast(src->num_dimensions()))); + const bool needs_permute = actual_axis != 0; ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet."); // Perform fusion test and check if the operator meets the fusion constraints @@ -128,17 +133,16 @@ Status 
GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst, - const Attributes &attributes) +void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, dst, attributes); TensorShape logits_sum_shape = src->tensor_shape(); - ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); logits_sum_shape.set(0, 1); - ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); // Auto initialize dst tensor info and the auxiliary tensor infos as well auto_init_if_empty(*dst, *src->clone()); @@ -151,7 +155,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -160,7 +164,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, // Add Direct Conv2d Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments_exp_sum; ArgumentPack arguments_norm; diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp index 8240008f2a..e5d62c9930 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,12 +33,11 @@ namespace experimental { namespace dynamic_fusion { -Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Sub then call the elementwise common validate_op @@ -46,12 +46,11 @@ Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuSub::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Sub then call the elementwise common is_supported_op @@ -60,9 +59,7 @@ Status GpuSub::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op() // Set the elementwise operation to Sub then call the elementwise common create_op diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp index c00716c76e..bf0f274c5c 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h" + #include "arm_compute/core/experimental/Types.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" namespace arm_compute { @@ -40,9 +41,7 @@ namespace dynamic_fusion { namespace { -Status 
is_supported_op_helper(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } auto_init_if_empty(dst_info_to_validate, *src->clone()); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src) +Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) { return is_supported_op_helper(context, src, nullptr); } -Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src) +Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -112,8 +110,7 @@ Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); } -ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src) +ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src); @@ -128,15 +125,15 @@ ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp index 7c087c9a7b..d79a4c42c9 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp 
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp @@ -22,11 +22,12 @@ * SOFTWARE. */ #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -38,9 +39,10 @@ namespace { void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first)); } } @@ -56,7 +58,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -64,7 +66,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); @@ -90,7 +92,8 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -ElementwiseBinaryCommonAttributes &ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) +ElementwiseBinaryCommonAttributes & +ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) { _operation = operation; return *this; @@ -157,14 +160,14 @@ ITensorInfo *GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add ElementwiseBinary Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, lhs); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp index 0972b4e8e2..775b0a0c8c 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "GpuKernelVariableTable.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" namespace arm_compute @@ -32,14 +34,17 @@ namespace experimental { namespace dynamic_fusion { -void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias) +void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + const ITensorInfo *tensor, + GpuKernelArgumentInfo argument_info, + const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); // Do not re-declare if the variable associated with the tensor has already been declared auto it = _vars.find(tensor->id()); - if(it != _vars.end()) + if (it != _vars.end()) { ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info)); return; @@ -47,14 +52,12 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com const auto target = comp_group.get_tile_for_tensor(tensor); - if(target != tensor) + if (target != tensor) { // If the tensor uses a shared tile, don't declare another variable. it = _vars.find(target->id()); - ARM_COMPUTE_ERROR_ON_MSG( - it == _vars.end(), - "The variable used for this tensor must have been declared."); + ARM_COMPUTE_ERROR_ON_MSG(it == _vars.end(), "The variable used for this tensor must have been declared."); _vars[tensor->id()] = it->second; } @@ -64,7 +67,7 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); const auto uniq_name = ss.str(); - TensorVariable var{ tensor->id(), uniq_name, argument_info }; + TensorVariable var{tensor->id(), uniq_name, argument_info}; _vars.emplace(tensor->id(), var); } @@ -76,12 +79,13 @@ GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(cons return var; } -GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector &tensors) const +GpuKernelVariableTable::VariableList +GpuKernelVariableTable::get_variable_list(const std::vector &tensors) const { VariableList vars{}; - for(const auto &tensor : tensors) + for (const auto &tensor : tensors) { - if(!tensor->has_valid_id()) + if (!tensor->has_valid_id()) { continue; } @@ -90,23 +94,19 @@ GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(c return vars; } -TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) - : value{ var.uniq_name } +TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) : value{var.uniq_name} { } -TagVal::TagVal(const std::string &val) - : value{ val } +TagVal::TagVal(const std::string &val) : value{val} { } -TagVal::TagVal(const char *val) - : value{ std::string(val) } +TagVal::TagVal(const char *val) : value{std::string(val)} { } -TagVal::TagVal(const DataType &data_type) - : value{ get_cl_type_from_data_type(data_type) } +TagVal::TagVal(const DataType &data_type) : value{get_cl_type_from_data_type(data_type)} { } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h index a49d38e10c..c17f131ada 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h +++ 
b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "support/AclRequires.h" #include "support/StringSupport.h" @@ -55,11 +56,11 @@ public: struct TensorVariable { public: - TensorVariable() = default; - TensorVariable(const TensorVariable &) = default; + TensorVariable() = default; + TensorVariable(const TensorVariable &) = default; TensorVariable &operator=(const TensorVariable &) = default; - ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id }; - std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code + ITensorInfo::Id id{ITensorInfo::invalid_tensor_id}; + std::string uniq_name{"empty"}; // Unique name, also the final variable name used in the built code GpuKernelArgumentInfo kernel_argument_info{}; bool has_valid_id() const { @@ -76,7 +77,10 @@ public: * @param[in] argument_info Kernel argument information * @param[in] alias Alias for the variable. Will be used as part of the variable name */ - void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias = "unnamed"); + void declare_variable(const GpuKernelComponentGroup &comp_group, + const ITensorInfo *tensor, + GpuKernelArgumentInfo argument_info, + const std::string &alias = "unnamed"); /** Get the @ref TensorVariable associated with @p tensor * * @param[in] tensor Tensor info to be queried @@ -106,8 +110,7 @@ struct TagVal TagVal(const GpuKernelVariableTable::TensorVariable &var); /** Construct a @ref TagVal from an integral type */ template ::value)> - TagVal(T val) - : value{ support::cpp11::to_string(val) } + TagVal(T val) : value{support::cpp11::to_string(val)} { } /** Construct a @ref TagVal from a string */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h index 4a1fb142d6..9d0b4f592a 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" @@ -57,8 +58,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack &tensors) - : _id{ id }, _tensors{ tensors } + IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack &tensors) : _id{id}, _tensors{tensors} { } /** Destructor */ @@ -112,7 +112,7 @@ public: /** Generate the header list used in the component */ virtual std::set get_headers_list() const { - return std::set {}; + return std::set{}; } /** Generate the execution window for the component */ virtual Window get_window() const @@ -131,7 +131,7 @@ public: } private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; ArgumentPack _tensors{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp 
b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp index 3c7c843dd8..c165fb5f33 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -39,10 +40,7 @@ namespace dynamic_fusion ClTemplateActivation::ClTemplateActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); @@ -62,7 +60,7 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_ code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN(src) {{src}} @@ -104,17 +102,11 @@ LOOP_UNROLLING(int, i, 0, 1, M0, void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -173,7 +165,7 @@ std::string ClTemplateActivation::get_config_id() const std::set ClTemplateActivation::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "activation_float_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"}; } Window ClTemplateActivation::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h index ec78cf6ce5..88ee370342 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp index 4956879ad3..0da3a73801 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp 
@@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -35,7 +36,7 @@ namespace experimental namespace dynamic_fusion { ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -62,7 +63,7 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group) //------------------ START KERNEL {{meta_kernel_id}} CAST --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN_0(src) {{src}} @@ -82,14 +83,15 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) + if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) { code += R"_( {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80; )_"; } - if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (kernel_name == "cast_down" && + (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { code += R"_( {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0)); @@ -106,7 +108,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(is_root) + if (is_root) { code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, @@ -128,17 +130,11 @@ TILE(uint, M0, 1, g_dst_indirect_y); void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -199,7 +195,7 @@ std::string ClTemplateCast::get_config_id() const std::set ClTemplateCast::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateCast::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp index ab7cc9f05a..8380620ab2 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -36,17 +36,17 @@ ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ 
attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -71,7 +71,7 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup & // IN_1(wei) {{weight}} )_"; - if(_bias != nullptr && _bias->has_valid_id()) + if (_bias != nullptr && _bias->has_valid_id()) { code += R"_( // IN_1(bia) {{bias}} @@ -113,7 +113,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(_weight->dimension(height_idx) < 5) + if (_weight->dimension(height_idx) < 5) { code += R"_( LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, @@ -147,7 +147,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(!_settings.is_fma_available()) + if (!_settings.is_fma_available()) { code += R"_( {{dst}}[m0].v += a[xk + m0].v * b[xk].v; @@ -166,14 +166,14 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(_weight->dimension(height_idx) < 5) + if (_weight->dimension(height_idx) < 5) { code += R"_( ) )_"; } - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}}); @@ -198,44 +198,31 @@ TILE(uint, M0, 1, g_dst_indirect_y); return code; } -void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(input_type), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() + ? 
GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } -TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -243,7 +230,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -259,7 +246,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["SRC_DATA_TYPE"] = _src->data_type(); lut["WEI_DATA_TYPE"] = _weight->data_type(); - switch(vtable.get_variable(_src).kernel_argument_info.type) + switch (vtable.get_variable(_src).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -271,7 +258,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab break; } - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -318,7 +305,7 @@ CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -361,7 +348,7 @@ std::string ClTemplateDepthwiseConv2d::get_config_id() const std::set ClTemplateDepthwiseConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDepthwiseConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h index 84b689ef64..5d04c687c3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" #include 
"src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 3322487910..f6a7a58d1d 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplateDirectConv2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -43,17 +42,17 @@ ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -79,7 +78,7 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com // IN_0(src) {{src}} // IN_1(wei) {{weight}} )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( // IN_1(bia) {{bias}} @@ -161,7 +160,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(leftover_loop) + if (leftover_loop) { code += R"_( for(; ck < _ISRC_CHANNELS; ++ck) @@ -186,9 +185,9 @@ TILE(uint, M0, 1, g_dst_indirect_y); T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); } )_"; -} + } -code += R"_( + code += R"_( #undef _I_WEI_WIDTH #undef _I_WEI_HEIGHT #undef _ISRC_WIDTH @@ -202,7 +201,7 @@ code += R"_( } )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); @@ -211,9 +210,9 @@ code += R"_( T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); )_"; -} + } -code += R"_( + code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1); @@ -227,32 +226,19 @@ code += R"_( void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? 
GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias && _bias->has_valid_id()) // optional bias + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -262,7 +248,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias && _bias->has_valid_id()) // optional bias + if (_bias && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -279,34 +265,34 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["WEI_DATA_TYPE"] = _weight->data_type(); lut["SRC_TENSOR_TYPE"] = "BUFFER"; - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: - { - lut["WEI_TENSOR_TYPE"] = "IMAGE"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } default: - { - lut["WEI_TENSOR_TYPE"] = "BUFFER"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } } - const auto width_idx = 1; - const auto height_idx = 2; + const auto width_idx = 1; + const auto height_idx = 2; const auto channel_idx = 0; - lut["SRC_WIDTH"] = _src->dimension(width_idx); - lut["SRC_HEIGHT"] = _src->dimension(height_idx); + lut["SRC_WIDTH"] = _src->dimension(width_idx); + lut["SRC_HEIGHT"] = _src->dimension(height_idx); lut["SRC_CHANNELS"] = _src->dimension(channel_idx); - lut["WEI_WIDTH"] = _weight->dimension(width_idx); - lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); - lut["DST_WIDTH"] = _dst->dimension(width_idx); - lut["DST_HEIGHT"] = _dst->dimension(height_idx); + lut["DST_WIDTH"] = _dst->dimension(width_idx); + lut["DST_HEIGHT"] = _dst->dimension(height_idx); lut["DST_CHANNELS"] = _dst->dimension(channel_idx); lut["STRIDE_X"] = _attributes.stride().x(); @@ -324,14 +310,14 @@ CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &c { const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); - const auto 
root_window = comp_group.get_root_component()->template_writer()->get_window(); - const unsigned int n0 = root_window.x().step(); - const unsigned int m0 = root_window.y().step(); - const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); const unsigned int partial_store_n0 = _dst->dimension(0) % n0; CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -379,7 +365,7 @@ std::string ClTemplateDirectConv2d::get_config_id() const std::set ClTemplateDirectConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDirectConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h index 8988d3ca1c..03c8cd2f15 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index c0481ae190..78bff3c3f3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplateElementwiseBinary.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -44,11 +43,7 @@ constexpr unsigned int vector_size_byte_opencl = 16; ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,67 +64,67 @@ std::string ClTemplateElementwiseBinary::get_component_code(const 
ComponentGroup const bool is_rhs_input = comp_group.is_input_tensor(_rhs); code = -R"_( + R"_( //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; - if(is_root) + if (is_root) { code += -R"_( + R"_( TILE(uint, M0, 1, g_dst_indirect_y); )_"; } - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}}); )_"; } code += -R"_( + R"_( { )_"; - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}}); )_"; } code += -R"_( + R"_( T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); )_"; - if(is_root) + if (is_root) { // Calculate the destination indirect Y code += -R"_( + R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); @@ -139,7 +134,7 @@ R"_( } code += -R"_( + R"_( } //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; @@ -147,28 +142,18 @@ R"_( return code; } -void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _lhs, - GpuKernelArgumentInfo(common_tensor_type), - "lhs"); - - vtable.declare_variable( - comp_group, - _rhs, - GpuKernelArgumentInfo(common_tensor_type), - "rhs"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs"); + + vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } -TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -182,7 +167,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["dst"] = vtable.get_variable(_dst); lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor()); - switch(_attributes.operation()) + switch (_attributes.operation()) { case Attributes::ElementwiseOp::Add: lut["ELTWISE_OP"] = "ADD"; @@ -197,10 +182,10 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); } - ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); - ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 
0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) && + detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) && + detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); // Set broadcast parameters // PRE: All tensors are broadcast-compatible @@ -228,9 +213,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0"; lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1"; - lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : - (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : - ""; + lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : ""; return lut; } @@ -268,7 +251,7 @@ std::string ClTemplateElementwiseBinary::get_config_id() const std::set ClTemplateElementwiseBinary::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateElementwiseBinary::get_window() const @@ -279,8 +262,9 @@ Window ClTemplateElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h index 8cca954efe..991c0eca44 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -48,9 +49,7 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp index a8d8d32b12..522c33a022 100644 --- 
a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,16 +39,12 @@ namespace dynamic_fusion { namespace { - constexpr unsigned int serial_vector_size = 8; +constexpr unsigned int serial_vector_size = 8; } // namespace ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -79,7 +76,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const bool beta_defined = (_attributes.beta() != 1.f); - if(beta_defined) + if (beta_defined) { code += R"_( VEC_TYPE beta = (VEC_TYPE){{BETA}}; @@ -91,7 +88,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size); const bool non_multiple_of_n0 = ((reduction_dim_size % vector_size) != 0); - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr); @@ -111,19 +108,19 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component VEC_TYPE sum1D = 0; )_"; - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE_PARTIAL(N0, PARTIAL_N0) @@ -153,14 +150,14 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE(N0) @@ -191,28 +188,18 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component return code; } -void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } 
-TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); @@ -241,8 +228,8 @@ CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const Compone ARM_COMPUTE_UNUSED(comp_group); CLBuildOptions build_opts{}; - const unsigned int reduction_dim_size = _src->dimension(0); - const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); + const unsigned int reduction_dim_size = _src->dimension(0); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size))); @@ -264,7 +251,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const std::set ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DMaxShiftExpSum::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h index 5d232c0cf2..ac9ddaa9d4 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h @@ -46,7 +46,9 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); + ClTemplateLogits1DMaxShiftExpSum(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp index 056e570a25..7d7c3e6673 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp @@ -25,6 +25,7 @@ #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,11 +39,7 @@ namespace dynamic_fusion ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -76,7 +73,7 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} 
*)src_addr); )_"; - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( sum_val = log(sum_val); @@ -101,23 +98,11 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -168,14 +153,14 @@ std::string ClTemplateLogits1DNorm::get_config_id() const std::set ClTemplateLogits1DNorm::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DNorm::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); constexpr unsigned int serial_vector_size = 16; - const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); Window win = calculate_max_window(*_src, Steps(vector_size)); return win.collapse(win, Window::DimZ); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp index 34840c2100..ebb0374501 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplatePool2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -50,11 +49,7 @@ ClTemplatePool2d::ClTemplatePool2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -71,7 +66,7 @@ std::string 
ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou ARM_COMPUTE_UNUSED(comp_group); // Condition to use 2x2 optimized kernel - if(_attributes.pool_size() == Size2D(2, 2)) + if (_attributes.pool_size() == Size2D(2, 2)) { return get_2x2_kernel_code(); } @@ -83,11 +78,13 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou std::string ClTemplatePool2d::get_MxN_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; // Define pool op macro. - std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; // Kernel start // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0 @@ -129,7 +126,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const )_"; // Determine filter size depending on if padding is excluded or not - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); @@ -144,7 +141,8 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // Loop through pool size // if global pooling - if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx)) + if (_attributes.pool_size().x() == _src->dimension(width_idx) && + _attributes.pool_size().y() == _src->dimension(height_idx)) { // Begin loop code += R"_( @@ -173,7 +171,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // if condition inside loop - use 32bit acc if mixed_precision. // End loop through pooling section. - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -194,7 +192,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // For Pool AVG ONLY, divide pool output by filter size - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -202,7 +200,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // If mixed precision convert datatype before storing. Then end kernel. - if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -228,9 +226,11 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const std::string ClTemplatePool2d::get_2x2_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; - std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + std::string pool_op = (pool_type == PoolingType::AVG) ? 
R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; std::string code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- @@ -274,7 +274,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0); )_"; - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -294,7 +294,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const )_"; } - if(pool_type != PoolingType::MAX) + if (pool_type != PoolingType::MAX) { // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) code += R"_( @@ -321,10 +321,10 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const res0 = POOL_OP(res0, data3); )_"; - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { // If avg pooling divide result accordingly. - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -339,7 +339,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const } // Store result - if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -365,17 +365,11 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -391,12 +385,15 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["meta_kernel_id"] = id(); // Retrieve relevant data - const auto padding = _attributes.pad(); - const auto stride = _attributes.stride(); - const auto pool_size = _attributes.pool_size(); - const auto data_type = _src->data_type(); - const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const std::string max_initial_value = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits::lowest()); + const auto padding = _attributes.pad(); + const auto stride = _attributes.stride(); + const auto pool_size = _attributes.pool_size(); + const auto data_type = _src->data_type(); + const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && + _attributes.pool_type() != PoolingType::MAX; + const std::string max_initial_value = + _settings.use_inf_as_limit() ? 
"(-INFINITY)" + : float_to_string_with_full_precision(std::numeric_limits::lowest()); // pool specific lut["STRIDE_X"] = stride.x(); @@ -407,7 +404,8 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["POOL_SIZE_Y"] = pool_size.height; // Datatypes and variables - lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. + lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type( + (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type); lut["SRC_WIDTH"] = _src->dimension(width_idx); lut["SRC_HEIGHT"] = _src->dimension(height_idx); @@ -454,14 +452,14 @@ std::string ClTemplatePool2d::get_config_id() const std::set ClTemplatePool2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "repeat.h" }; + return std::set{"helpers.h", "tile_helpers.h", "repeat.h"}; } Window ClTemplatePool2d::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); const auto output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h index ef1c100f44..d1d3c01669 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp index 8b50f1e209..c882353fcb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -36,11 +37,8 @@ namespace dynamic_fusion { constexpr unsigned int vector_size_byte_opencl = 16; -ClTemplateReshape::ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{} +ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack &tensors) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -97,23 +95,17 @@ TILE(uint, M0, 1, g_dst_indirect_y); void 
ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _src, + GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - TagLUT lut{}; + TagLUT lut{}; // Arguments and global shared variables lut["src"] = vtable.get_variable(_src); @@ -153,7 +145,7 @@ std::string ClTemplateReshape::get_config_id() const std::set ClTemplateReshape::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateReshape::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h index 56b6585b61..838a21db6d 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -42,8 +43,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors); + ClTemplateReshape(ComponentId id, const ArgumentPack &tensors); /** Prevent instances of this class from being copy constructed */ ClTemplateReshape(const ClTemplateReshape &reshape) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp index aaed1d990d..846c712ceb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -37,8 +38,10 @@ namespace experimental { namespace dynamic_fusion { -ClTemplateResize::ClTemplateResize(ComponentId id, const ArgumentPack &tensors, const ClTemplateResize::Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } +ClTemplateResize::ClTemplateResize(ComponentId id, + const ArgumentPack &tensors, + const ClTemplateResize::Attributes &attributes) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -63,9 +66,9 @@ TILE(uint, 1, 1, 
g_dst_indirect_y); const int bout = g_ind_2 / {{arg_dst}}_h; )_"; - if(_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR) + if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR) { - if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) + if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) { code += R"_( float xi_f = (g_ind_1 * {{SCALE_X}}); @@ -80,7 +83,7 @@ TILE(uint, 1, 1, g_dst_indirect_y); )_"; } - if(_attributes.align_corners()) + if (_attributes.align_corners()) { code += R"_( xi_f = round(xi_f); @@ -95,9 +98,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}}); )_"; } - else if(_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR) + else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR) { - if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) + if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) { code += R"_( float xi_f = (g_ind_1 * {{SCALE_X}}); @@ -137,7 +140,7 @@ TILE(uint, 1, 1, g_dst_indirect_y); T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11); )_"; - if(is_data_type_float(_src->data_type())) + if (is_data_type_float(_src->data_type())) { code += R"_( const {{SRC_DATA_TYPE}} a = ({{SRC_DATA_TYPE}})(xi_f - (float)xi); @@ -158,9 +161,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); const float b1 = (1.f - a1); {{dst}}[0].v = CONVERT_SAT( - (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + + (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + - (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + + (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0)); )_"; } @@ -179,22 +182,18 @@ TILE(uint, 1, 1, g_dst_indirect_y); return code; } -void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const +void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, + const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } -TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const +TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, + const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const { TagLUT lut{}; @@ -212,8 +211,10 @@ TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["DST_DATA_TYPE"] = get_cl_type_from_data_type(_dst->data_type()); lut["CONSTANT_VALUE"] = string_from_pixel_value(0, _src->data_type()); - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), 
_dst->dimension(1), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners()); + const float scale_x = + scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners()); + const float scale_y = + scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners()); lut["SCALE_X"] = float_to_string_with_full_precision(scale_x); lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y); @@ -242,7 +243,8 @@ std::string ClTemplateResize::get_config_id() const std::string config_id{}; config_id += "resize_"; - config_id += (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : ""); + config_id += + (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : ""); config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : ""); config_id += "_"; config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft"); @@ -260,7 +262,7 @@ std::string ClTemplateResize::get_config_id() const std::set ClTemplateResize::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateResize::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index 217214ced3..d0ec91e0a9 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -32,7 +32,7 @@ namespace experimental namespace dynamic_fusion { ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack &tensors) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{} + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -61,16 +61,10 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h index 3f97a82204..b8c82ceadd 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE #include "arm_compute/core/experimental/Types.h" + #include 
"src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp index eda15f1d95..d3d7c8db83 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -24,6 +24,7 @@ #include "ClTemplateWriter.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -39,11 +40,11 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con std::string replaced_code = ""; bool scanning_pattern = false; std::string pattern_found = ""; - for(size_t i = 0; i < code_template.size() - 1; ++i) + for (size_t i = 0; i < code_template.size() - 1; ++i) { - if(!scanning_pattern) + if (!scanning_pattern) { - if(code_template[i] == '{' && code_template[i + 1] == '{') + if (code_template[i] == '{' && code_template[i + 1] == '{') { i += 1; scanning_pattern = true; @@ -56,7 +57,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con } else { - if(code_template[i] == '}' && code_template[i + 1] == '}') + if (code_template[i] == '}' && code_template[i + 1] == '}') { i += 1; scanning_pattern = false; @@ -76,8 +77,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con ClTemplateWriter::~ClTemplateWriter() { } -ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) - : _components{ components } +ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components} { } std::string ClTemplateWriter::get_name() @@ -91,7 +91,7 @@ std::string ClTemplateWriter::get_code() std::string ClTemplateWriter::get_config_id() { std::string config_id = get_name(); - for(const auto &comp : _components) + for (const auto &comp : _components) { config_id += "--" + comp->template_writer()->get_config_id() + "--"; } @@ -103,7 +103,7 @@ CLBuildOptions ClTemplateWriter::get_build_options() { CLBuildOptions build_opts{}; - for(const auto &comp : _components) + for (const auto &comp : _components) { build_opts.add_options(comp->template_writer()->get_build_options(_components).options()); } @@ -122,11 +122,9 @@ std::map ClTemplateWriter::get_tensors() { // Assemble GpuKernelArguments std::map tensors; - for(const auto t : _components.get_argument_tensors()) + for (const auto t : _components.get_argument_tensors()) { - tensors.emplace( - t->id(), - GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info }); + tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info}); } return tensors; } @@ -141,22 +139,24 @@ std::string ClTemplateWriter::write_code() std::vector component_codes{}; // vector because order matters // Pass 1: Declare all kernel variables - for(auto &component : _components) + for (auto &component : _components) { component->template_writer()->declare_variables(_vtable, _components); } // Pass 2: Generate component codes - for(auto &component : _components) + for (auto &component : _components) { const auto component_writer = component->template_writer(); auto curr_headers_list = component_writer->get_headers_list(); auto curr_additional_macros = 
component_writer->get_additional_macros(); auto curr_component_code = component_writer->get_component_code(_components); - const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique + const auto var_lut = component_writer->get_tag_lut( + _vtable, + _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique component_codes.push_back(replace_tags(curr_component_code, var_lut)); headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); - if(!additional_macros.empty()) // Some components might not have any + if (!additional_macros.empty()) // Some components might not have any { additional_macros.insert(replace_tags(curr_additional_macros, var_lut)); } @@ -165,7 +165,7 @@ std::string ClTemplateWriter::write_code() // Step 3: Assemble the data gathered by traversing the graph into the string "code" std::string code = ""; - for(auto &header : headers_list) + for (auto &header : headers_list) { #if defined(EMBEDDED_KERNELS) code += CLKernelLibrary::get().get_program(header).first; @@ -174,16 +174,14 @@ std::string ClTemplateWriter::write_code() #endif // defined(EMBEDDED_KERNELS) } - for(auto &macros : additional_macros) + for (auto &macros : additional_macros) { code += macros; } auto arguments = _components.get_argument_tensors(); - std::sort(arguments.begin(), arguments.end(), [](const ITensorInfo * l, const ITensorInfo * r) - { - return l->id() < r->id(); - }); + std::sort(arguments.begin(), arguments.end(), + [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); }); code += write_kernel_signature(_vtable.get_variable_list(arguments)); code += "\n{\n\n"; @@ -198,7 +196,7 @@ std::string ClTemplateWriter::write_code() tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n"; - for(auto tile : tiles) + for (auto tile : tiles) { const auto var = _vtable.get_variable(tile); const auto data_type = get_cl_type_from_data_type(tile->data_type()); @@ -212,7 +210,7 @@ std::string ClTemplateWriter::write_code() code += tiles_ss.str(); } - for(const auto &component_code : component_codes) + for (const auto &component_code : component_codes) { code += component_code; code += "\n"; @@ -231,7 +229,8 @@ std::string ClTemplateWriter::write_global_section() const auto leftover_w = dst_w % tile_w; std::string code = ""; - code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; + code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + + std::to_string(leftover_w) + ");\n"; code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n"; code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n"); @@ -243,7 +242,7 @@ std::string ClTemplateWriter::write_global_section() const std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const { std::string code; - switch(var.kernel_argument_info.type) + switch (var.kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Vector: { @@ -293,11 +292,11 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl { std::string code = "\n__kernel void " + write_kernel_name() + "("; - for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i) + for (int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i) { code +=
write_argument_declaration(argument_list[i]) + ","; } - if(static_cast(argument_list.size()) - 1 >= 0) + if (static_cast(argument_list.size()) - 1 >= 0) { code += write_argument_declaration(argument_list[argument_list.size() - 1]); } @@ -308,12 +307,12 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl } std::string ClTemplateWriter::write_kernel_name() const { - if(_components.empty()) + if (_components.empty()) { return "empty_kernel"; } std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name(); - for(size_t i = 1; i < _components.size(); ++i) + for (size_t i = 1; i < _components.size(); ++i) { name += "___"; name += _components[i]->template_writer()->get_name(); diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h index c891e76d8b..c157c2b21c 100644 --- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h +++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH #include "arm_compute/core/Error.h" + #include #include #include @@ -68,12 +69,10 @@ public: OperatorId op{}; std::vector inputs{}; std::vector outputs{}; - friend bool operator==(const OpPack &opp0, const OpPack &opp1) + friend bool operator==(const OpPack &opp0, const OpPack &opp1) { - return std::make_tuple( - opp0.op, opp0.inputs, opp0.outputs) - == std::make_tuple( - opp1.op, opp1.inputs, opp1.outputs); + return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) == + std::make_tuple(opp1.op, opp1.inputs, opp1.outputs); } }; @@ -95,10 +94,13 @@ public: * @return true If the operator can be added while keeping the graph as a linear sequence * @return false Otherwise */ - bool try_add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) const + bool try_add_operator_as_linear(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) const { ARM_COMPUTE_UNUSED(op, is_output); - if(all_ops().empty()) + if (all_ops().empty()) { return true; } @@ -106,25 +108,25 @@ public: // If the new operator is not the first operator, at least one input tensor must be // the output tensor of the last non-output operator. All other input tensors must be // the global input of the graph (i.e. not the output of any operator). - if(_last_op_available) + if (_last_op_available) { auto use_input_from_last_op = false; - for(auto src_tensor : inputs) + for (auto src_tensor : inputs) { const auto src_ops = _adj_src_ops.find(src_tensor); - if(src_ops != _adj_src_ops.end()) + if (src_ops != _adj_src_ops.end()) { ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1); - if(!src_ops->second.empty()) + if (!src_ops->second.empty()) { const auto src_op = src_ops->second[0]; - if(src_op == _last_op) + if (src_op == _last_op) { - if(use_input_from_last_op) + if (use_input_from_last_op) { // To be safe, we also forbid using the output tensor // of the last operator twice. @@ -143,7 +145,7 @@ public: } } - if(!use_input_from_last_op) + if (!use_input_from_last_op) { // At least one input tensor must be the output tensor of the last non-output operator. return false; @@ -152,9 +154,9 @@ public: // The output tensor of the new operator must not be the input tensor of any previously // added operator. 
- for(auto dst_tensor : outputs) + for (auto dst_tensor : outputs) { - if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) + if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) { return false; } @@ -168,7 +170,10 @@ public: * INVARIANT: The list can only grow from head to tail * INVARIANT: POSTCONDITION: The graph is linear */ - void add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) + void add_operator_as_linear(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) { const auto success = add_operator(op, inputs, outputs, is_output); ARM_COMPUTE_UNUSED(success); @@ -183,24 +188,27 @@ public: * @param[in] outputs Output tensors to the operator * @param[in] is_output Whether this is an output operator */ - bool add_operator(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) + bool add_operator(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) { - if(operator_exists(op)) + if (operator_exists(op)) { return false; } _adj_src_tensors[op] = {}; _adj_dst_tensors[op] = {}; - for(auto in_tensor : inputs) + for (auto in_tensor : inputs) { // Linking input tensor to operator node will never create a cycle / loop because we guarantee // each op is newly created, so every pair / edge is new link_input(op, in_tensor); } - for(auto out_tensor : outputs) + for (auto out_tensor : outputs) { // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle - if(path_exists_from_tensor_to_op(out_tensor, op)) + if (path_exists_from_tensor_to_op(out_tensor, op)) { remove_operator(op); return false; @@ -211,10 +219,10 @@ public: } } - if(!is_output) + if (!is_output) { _last_op_available = true; - _last_op = op; + _last_op = op; } return true; @@ -230,16 +238,16 @@ public: std::vector build_operators_sequence() const { std::vector ops_seq; - std::set done_ops; - std::set done_tensors; + std::set done_ops; + std::set done_tensors; const auto input_tensors = global_src_tensors(); - for(auto tensor : input_tensors) + for (auto tensor : input_tensors) { done_tensors.insert(tensor); - for(auto op : _adj_dst_ops.at(tensor)) + for (auto op : _adj_dst_ops.at(tensor)) { build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors); } @@ -260,10 +268,8 @@ public: friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1) { // Do not compare id allocators - return std::make_tuple( - g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) - == std::make_tuple( - g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops); + return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) == + std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops); } std::vector src_ops_from_tensor(TensorId tensor) const { @@ -280,10 +286,8 @@ public: std::vector all_tensors() const { std::vector tensors{}; - std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it) - { - return it.first; - }); + std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), + [](const auto &it) { return it.first; }); return tensors; } /** Get source tensors of the whole graph @@ -293,9 +297,9 @@ public: std::vector global_src_tensors() const { std::vector tensors; - for(auto 
tensor_src_ops : _adj_src_ops) + for (auto tensor_src_ops : _adj_src_ops) { - if(tensor_src_ops.second.empty()) + if (tensor_src_ops.second.empty()) { tensors.push_back(tensor_src_ops.first); } @@ -309,9 +313,9 @@ public: std::vector global_dst_tensors() const { std::vector tensors; - for(auto tensor_dst_ops : _adj_dst_ops) + for (auto tensor_dst_ops : _adj_dst_ops) { - if(tensor_dst_ops.second.empty()) + if (tensor_dst_ops.second.empty()) { tensors.push_back(tensor_dst_ops.first); } @@ -328,14 +332,14 @@ public: // If a tensor is used to connect the input of an operator and the output of another operator, // it is not allocated in the memory. The tensor exists as a temporary variable only. - for(auto src_tensor : _adj_src_ops) + for (auto src_tensor : _adj_src_ops) { - if(!src_tensor.second.empty()) + if (!src_tensor.second.empty()) { const auto dst_tensor = _adj_dst_ops.find(src_tensor.first); - if(dst_tensor != _adj_dst_ops.end()) + if (dst_tensor != _adj_dst_ops.end()) { - if(!dst_tensor->second.empty()) + if (!dst_tensor->second.empty()) { tensors.push_back(src_tensor.first); } @@ -354,9 +358,9 @@ public: std::vector ops{}; const auto op_list = all_ops(); - for(auto op : op_list) + for (auto op : op_list) { - if(src_ops(op).empty()) + if (src_ops(op).empty()) { ops.emplace_back(op); } @@ -368,7 +372,7 @@ private: void link_input(OperatorId op, TensorId in_tensor) { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - if(!tensor_exists(in_tensor)) + if (!tensor_exists(in_tensor)) { insert_new_tensor(in_tensor); } @@ -379,7 +383,7 @@ private: void link_output(OperatorId op, TensorId out_tensor) { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - if(!tensor_exists(out_tensor)) + if (!tensor_exists(out_tensor)) { insert_new_tensor(out_tensor); } @@ -392,7 +396,7 @@ private: { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); std::vector ops{}; - for(TensorId src_tensor : src_tensors(op)) + for (TensorId src_tensor : src_tensors(op)) { ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor))); } @@ -402,7 +406,7 @@ private: { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); std::vector ops{}; - for(TensorId dst_tensor : _adj_dst_tensors.at(op)) + for (TensorId dst_tensor : _adj_dst_tensors.at(op)) { ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor))); } @@ -436,10 +440,8 @@ private: std::vector all_ops() const { std::vector ops{}; - std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it) - { - return it.first; - }); + std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), + [](const auto &it) { return it.first; }); return ops; } /** Remove an operator from graph. 
@@ -448,25 +450,21 @@ private: */ void remove_operator(OperatorId op) { - for(auto src_tensor : _adj_src_tensors.at(op)) + for (auto src_tensor : _adj_src_tensors.at(op)) { auto &dst_ops = _adj_dst_ops.at(src_tensor); - dst_ops.erase( - std::remove(std::begin(dst_ops), std::end(dst_ops), op), - std::end(dst_ops)); + dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops)); } - for(auto dst_tensor : _adj_dst_tensors.at(op)) + for (auto dst_tensor : _adj_dst_tensors.at(op)) { auto &src_ops = _adj_src_ops.at(dst_tensor); - src_ops.erase( - std::remove(std::begin(src_ops), std::end(src_ops), op), - std::end(src_ops)); + src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops)); } // Remove any isolated tensors // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty - for(auto t : all_tensors()) + for (auto t : all_tensors()) { - if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty()) + if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty()) { _adj_src_ops.erase(t); _adj_dst_ops.erase(t); @@ -486,11 +484,12 @@ private: } bool operator_exists(OperatorId op) const { - return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); + return _adj_src_tensors.find(op) != _adj_src_tensors.end() && + _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); } bool is_src_tensor_of(OperatorId op, TensorId tensor) const { - if(!operator_exists(op) || !tensor_exists(tensor)) + if (!operator_exists(op) || !tensor_exists(tensor)) { return false; } @@ -499,7 +498,7 @@ private: } bool is_dst_tensor_of(OperatorId op, TensorId tensor) const { - if(!operator_exists(op) || !tensor_exists(tensor)) + if (!operator_exists(op) || !tensor_exists(tensor)) { return false; } @@ -525,9 +524,9 @@ private: std::vector ops{}; const auto op_list = all_ops(); - for(auto op : op_list) + for (auto op : op_list) { - if(is_dst_op(op)) + if (is_dst_op(op)) { ops.emplace_back(op); } @@ -536,13 +535,13 @@ private: } bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const { - if(!tensor_exists(src_tensor) || !operator_exists(dst_op)) + if (!tensor_exists(src_tensor) || !operator_exists(dst_op)) { return false; } - for(auto child_op : dst_ops_from_tensor(src_tensor)) + for (auto child_op : dst_ops_from_tensor(src_tensor)) { - if(path_exists_from_op_to_op(child_op, dst_op)) + if (path_exists_from_op_to_op(child_op, dst_op)) { return true; } @@ -552,21 +551,21 @@ private: bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const { - if(!operator_exists(src_op) || !operator_exists(dst_op)) + if (!operator_exists(src_op) || !operator_exists(dst_op)) { return false; } - if(src_op == dst_op) + if (src_op == dst_op) { return true; } - if(is_in(src_op, get_dst_ops())) + if (is_in(src_op, get_dst_ops())) { return false; } - for(auto child_tensor : dst_tensors(src_op)) + for (auto child_tensor : dst_tensors(src_op)) { - if(path_exists_from_tensor_to_op(child_tensor, dst_op)) + if (path_exists_from_tensor_to_op(child_tensor, dst_op)) { return true; } @@ -574,16 +573,15 @@ private: return false; } - void build_operators_sequence_from_op( - Id op, - std::vector &ops_seq, - std::set &done_ops, - std::set &done_tensors) const + void build_operators_sequence_from_op(Id op, + std::vector &ops_seq, + std::set &done_ops, + std::set &done_tensors) const { - while(true) + while (true) { // If the operator has been added to the sequence, ignore it. 
- if(done_ops.find(op) != done_ops.end()) + if (done_ops.find(op) != done_ops.end()) { return; } @@ -593,9 +591,9 @@ private: // is added to the sequence. const auto src_tensors = _adj_src_tensors.at(op); - for(auto src : src_tensors) + for (auto src : src_tensors) { - if(done_tensors.find(src) == done_tensors.end()) + if (done_tensors.find(src) == done_tensors.end()) { return; } @@ -606,24 +604,24 @@ private: done_ops.insert(op); - OpPack pack{ op, src_tensors, dst_tensors }; + OpPack pack{op, src_tensors, dst_tensors}; ops_seq.push_back(pack); done_tensors.insert(dst_tensors.begin(), dst_tensors.end()); // Visit all the sink operators. // Call this function recursively unless there is only one sink. - if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) + if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) { op = _adj_dst_ops.at(dst_tensors[0])[0]; } else { - for(auto dst_tensor : dst_tensors) + for (auto dst_tensor : dst_tensors) { const auto dst_ops = _adj_dst_ops.at(dst_tensor); - for(auto dst_op : dst_ops) + for (auto dst_op : dst_ops) { build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors); } @@ -640,8 +638,8 @@ private: AdjList _adj_src_ops{}; AdjList _adj_dst_ops{}; - bool _last_op_available{ false }; - OperatorId _last_op{ 0 }; + bool _last_op_available{false}; + OperatorId _last_op{0}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h index c9fc2c610f..3f4a2edd03 100644 --- a/src/dynamic_fusion/utils/Utils.h +++ b/src/dynamic_fusion/utils/Utils.h @@ -63,17 +63,21 @@ inline bool is_invalid_tensor(const ITensorInfo *tensor_info) /** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo */ -inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, bool mixed_precision = false, DataLayout data_layout = DataLayout::NHWC) +inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, + bool mixed_precision = false, + DataLayout data_layout = DataLayout::NHWC) { // Create PadStrideInfo const Size2D stride = pool_attr.stride(); const Padding2D padding = pool_attr.pad(); - const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, arm_compute::DimensionRoundingType::FLOOR); + const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, + arm_compute::DimensionRoundingType::FLOOR); - return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, pool_attr.exclude_padding(), mixed_precision); -} -} -} + return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, + pool_attr.exclude_padding(), mixed_precision); } +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute #endif /* SRC_DYNAMIC_FUSION_UTILS_UTILS */ diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp index d8ef18e62e..611c1cb501 100644 --- a/src/gpu/cl/ClContext.cpp +++ b/src/gpu/cl/ClContext.cpp @@ -23,11 +23,11 @@ */ #include "src/gpu/cl/ClContext.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/ClQueue.h" #include "src/gpu/cl/ClTensor.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" - namespace arm_compute { namespace gpu @@ -41,7 +41,7 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename) bool status = false; mlgo::MLGOHeuristics heuristics; - if(filename != nullptr) + if (filename != nullptr) { status = 
heuristics.reload_from_file(filename); } @@ -50,12 +50,9 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename) } // namespace ClContext::ClContext(const AclContextOptions *options) - : IContext(Target::GpuOcl), - _mlgo_heuristics(), - _cl_ctx(), - _cl_dev() + : IContext(Target::GpuOcl), _mlgo_heuristics(), _cl_ctx(), _cl_dev() { - if(options != nullptr) + if (options != nullptr) { _mlgo_heuristics = populate_mlgo(options->kernel_config_file); } @@ -80,7 +77,7 @@ const mlgo::MLGOHeuristics &ClContext::mlgo() const bool ClContext::set_cl_ctx(::cl::Context ctx) { - if(this->refcount() == 0) + if (this->refcount() == 0) { _cl_ctx = ctx; CLScheduler::get().set_context(ctx); @@ -92,7 +89,7 @@ bool ClContext::set_cl_ctx(::cl::Context ctx) ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool allocate) { ClTensor *tensor = new ClTensor(this, desc); - if(tensor != nullptr && allocate) + if (tensor != nullptr && allocate) { tensor->allocate(); } diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h index a50b03124b..2c67ccf4d2 100644 --- a/src/gpu/cl/ClContext.h +++ b/src/gpu/cl/ClContext.h @@ -24,11 +24,11 @@ #ifndef SRC_GPU_CLCONTEXT_H #define SRC_GPU_CLCONTEXT_H +#include "arm_compute/core/CL/OpenCL.h" + #include "src/common/IContext.h" #include "src/runtime/CL/mlgo/MLGOHeuristics.h" -#include "arm_compute/core/CL/OpenCL.h" - namespace arm_compute { namespace gpu @@ -74,9 +74,9 @@ public: bool set_cl_ctx(::cl::Context ctx); // Inherrited methods overridden - ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; - IQueue *create_queue(const AclQueueOptions *options) override; - std::tuple create_activation(const AclTensorDescriptor &src, + ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; + std::tuple create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) override; @@ -90,4 +90,4 @@ private: } // namespace gpu } // namespace arm_compute -#endif /* SRC_GPU_CLCONTEXT_H */ \ No newline at end of file +#endif /* SRC_GPU_CLCONTEXT_H */ diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index 73bb96298e..bcade94522 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -37,24 +37,16 @@ namespace { /* Decoding table */ -constexpr std::array b64_invtab = -{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, - 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, - 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +constexpr std::array b64_invtab = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 
61, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /** Decode a base64 encoded string @@ -68,13 +60,13 @@ std::string decode_base64(const std::string &str) constexpr const char pad_char = '='; // Handle empty string - if(str.empty()) + if (str.empty()) { return {}; } // Base64 encoded string has size multiple of 4 - if(str.length() % 4) + if (str.length() % 4) { return {}; } @@ -92,7 +84,7 @@ std::string decode_base64(const std::string &str) // Block decoding function (exclude padding) int c = 0; const int end = str_len - 4 - padding; - for(; c <= end; c += 4) + for (; c <= end; c += 4) { const int byte0 = b64_invtab[str[c]]; const int byte1 = b64_invtab[str[c + 1]]; @@ -105,7 +97,7 @@ std::string decode_base64(const std::string &str) } // Last step that might contain padding symbols - if(padding == 1) + if (padding == 1) { const int byte0 = b64_invtab[str[c]]; const int byte1 = b64_invtab[str[c + 1]]; @@ -114,7 +106,7 @@ std::string decode_base64(const std::string &str) dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); } - else if(padding == 2) + else if (padding == 2) { const int byte0 = b64_invtab[str[c]]; const int byte1 = b64_invtab[str[c + 1]]; @@ -135,7 +127,7 @@ std::string decompress_zlib(const std::string &str) { // Create and initialize decompression stream z_stream ds{}; - if(inflateInit(&ds) != Z_OK) + if (inflateInit(&ds) != Z_OK) { return std::string(); } @@ -152,16 +144,15 @@ std::string decompress_zlib(const std::string &str) ds.next_out = reinterpret_cast(roll_buff); status = inflate(&ds, 0); - if(inflated_str.size() < ds.total_out) + if (inflated_str.size() < ds.total_out) { inflated_str.append(roll_buff, ds.total_out - inflated_str.size()); } - } - while(status == Z_OK); + } while (status == Z_OK); // Finalize decompression stream inflateEnd(&ds); - if(status != Z_STREAM_END) + if (status != Z_STREAM_END) { return std::string(); } @@ -175,323 +166,321 @@ namespace arm_compute { namespace opencl { -const std::map ClKernelLibrary::_kernel_program_map = -{ +const std::map ClKernelLibrary::_kernel_program_map = { // Common Kernels - { "activation_layer", "common/activation_layer.cl" }, - { "activation_layer_quant", "common/activation_layer_quant.cl" }, - { "activation_layer_quant_f32", "common/activation_layer_quant.cl" }, - { "arg_min_max_x", "common/arg_min_max.cl" }, - { "arg_min_max_y", "common/arg_min_max.cl" }, - { "arg_min_max_z", "common/arg_min_max.cl" }, - { "arg_min_max_w", "common/arg_min_max.cl" }, - { "bitwise_or", "common/bitwise_op.cl" }, - { "bitwise_and", "common/bitwise_op.cl" }, - { "bitwise_xor", "common/bitwise_op.cl" }, - { "bitwise_not", "common/bitwise_op.cl" }, - { "bounding_box_transform", "common/bounding_box_transform.cl" }, - { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" }, - { "compare_equal", "common/comparisons.cl" }, - { "compare_equal_quantized", 
"common/comparisons.cl" }, - { "compare_notequal", "common/comparisons.cl" }, - { "compare_notequal_quantized", "common/comparisons.cl" }, - { "compare_greater", "common/comparisons.cl" }, - { "compare_greater_quantized", "common/comparisons.cl" }, - { "compare_greaterequal", "common/comparisons.cl" }, - { "compare_greaterequal_quantized", "common/comparisons.cl" }, - { "compare_less", "common/comparisons.cl" }, - { "compare_less_quantized", "common/comparisons.cl" }, - { "compare_lessequal", "common/comparisons.cl" }, - { "compare_lessequal_quantized", "common/comparisons.cl" }, - { "concatenate", "common/concatenate.cl" }, - { "concatenate_width", "common/concatenate.cl" }, - { "concatenate_height", "common/concatenate.cl" }, - { "concatenate_width_x2", "common/concatenate.cl" }, - { "concatenate_width_x4", "common/concatenate.cl" }, - { "col2im", "common/col2im.cl" }, - { "cast_down", "common/cast.cl" }, - { "cast_up", "common/cast.cl" }, - { "convert_fc_weights", "common/convert_fc_weights.cl" }, - { "copy_tensor", "common/copy_tensor.cl" }, - { "crop_tensor", "common/crop_tensor.cl" }, - { "deconvolution_reshape", "common/deconvolution_layer.cl" }, - { "deconvolution_upsample", "common/deconvolution_layer.cl" }, - { "dequantization_layer", "common/dequantization_layer.cl" }, - { "elementwise_operation_ADD", "common/elementwise_operation.cl" }, - { "elementwise_operation_SUB", "common/elementwise_operation.cl" }, - { "elementwise_operation_MAX", "common/elementwise_operation.cl" }, - { "elementwise_operation_MIN", "common/elementwise_operation.cl" }, - { "elementwise_operation_DIV", "common/elementwise_operation.cl" }, - { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" }, - { "elementwise_operation_POWER", "common/elementwise_operation.cl" }, - { "elementwise_operation_PRELU", "common/elementwise_operation.cl" }, - { "elementwise_operation_AND", "common/elementwise_operation.cl" }, - { "elementwise_operation_OR", "common/elementwise_operation.cl" }, - { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_unary", "common/elementwise_unary.cl" }, - { "elementwise_unary_quantized", "common/elementwise_unary_quantized.cl" }, - { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, - { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, - { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_2_axis_0", "common/fft.cl" }, - { "fft_radix_2_axis_1", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_3_axis_0", "common/fft.cl" }, - { "fft_radix_3_axis_1", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_4_axis_0", "common/fft.cl" }, - { "fft_radix_4_axis_1", "common/fft.cl" }, - { 
"fft_radix_5_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_5_axis_0", "common/fft.cl" }, - { "fft_radix_5_axis_1", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_7_axis_0", "common/fft.cl" }, - { "fft_radix_7_axis_1", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_8_axis_0", "common/fft.cl" }, - { "fft_radix_8_axis_1", "common/fft.cl" }, - { "fft_scale_conj", "common/fft_scale.cl" }, - { "fill_image_borders_constant", "common/fill_border.cl" }, - { "fill_image_borders_replicate", "common/fill_border.cl" }, - { "floor_layer", "common/floor.cl" }, - { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, - { "gather", "common/gather.cl" }, - { "gemm_ma_f16", "common/gemm.cl" }, - { "gemm_ma_f32", "common/gemm.cl" }, - { "gemm_mv", "common/gemv.cl" }, - { "gemm_mv_quantized", "common/gemv.cl" }, - { "gemm_mm_native", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_lc_vm_f32", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" }, - { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" }, - { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_mm_native", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" }, - { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, - { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, - { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, - { "instance_normalization", "common/instance_normalization.cl" }, - { "compute_mean_var", "common/instance_normalization.cl" }, - { "l2_normalize_x", "common/l2_normalize.cl" }, - { "l2_normalize_y", "common/l2_normalize.cl" }, - { 
"l2_normalize_z", "common/l2_normalize.cl" }, - { "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_nt_nt", "common/mat_mul.cl" }, - { "mat_mul_native_nt_t", "common/mat_mul.cl" }, - { "mat_mul_native_t_nt", "common/mat_mul.cl" }, - { "mat_mul_native_t_t", "common/mat_mul.cl" }, - { "mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl" }, - { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, - { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, - { "memset", "common/memset.cl" }, - { "minmax_layer", "common/minmax_layer.cl" }, - { "non_max_suppression", "common/nonmax.cl" }, - { "pad_layer_constant", "common/pad_layer.cl" }, - { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, - { "permute", "common/permute.cl" }, - { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, - { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, - { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, - { "quantization_layer", "common/quantization_layer.cl" }, - { "range", "common/range.cl" }, - { "range_quantized", "common/range.cl" }, - { "reduction_operation_x", "common/reduction_operation.cl" }, - { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" }, - { "reduction_operation_y", "common/reduction_operation.cl" }, - { "reduction_operation_z", "common/reduction_operation.cl" }, - { "reduction_operation_w", "common/reduction_operation.cl" }, - { "reshape_layer", "common/reshape_layer.cl" }, - { "reshape_to_columns", "common/convolution_layer.cl" }, - { "reverse", "common/reverse.cl" }, - { "roi_align_layer", "common/roi_align_layer.cl" }, - { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, - { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, - { "select_same_rank", "common/select.cl" }, - { "select_different_rank_2", "common/select.cl" }, - { "select_different_rank_n", "common/select.cl" }, - { "softmax_layer_norm", "common/softmax_layer.cl" }, - { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, - { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, - { "stack_layer", "common/stack_layer.cl" }, - { "strided_slice", "common/slice_ops.cl" }, - { "tile", "common/tile.cl" }, - { "transpose", "common/transpose.cl" }, + {"activation_layer", "common/activation_layer.cl"}, + {"activation_layer_quant", "common/activation_layer_quant.cl"}, + 
{"activation_layer_quant_f32", "common/activation_layer_quant.cl"}, + {"arg_min_max_x", "common/arg_min_max.cl"}, + {"arg_min_max_y", "common/arg_min_max.cl"}, + {"arg_min_max_z", "common/arg_min_max.cl"}, + {"arg_min_max_w", "common/arg_min_max.cl"}, + {"bitwise_or", "common/bitwise_op.cl"}, + {"bitwise_and", "common/bitwise_op.cl"}, + {"bitwise_xor", "common/bitwise_op.cl"}, + {"bitwise_not", "common/bitwise_op.cl"}, + {"bounding_box_transform", "common/bounding_box_transform.cl"}, + {"bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl"}, + {"compare_equal", "common/comparisons.cl"}, + {"compare_equal_quantized", "common/comparisons.cl"}, + {"compare_notequal", "common/comparisons.cl"}, + {"compare_notequal_quantized", "common/comparisons.cl"}, + {"compare_greater", "common/comparisons.cl"}, + {"compare_greater_quantized", "common/comparisons.cl"}, + {"compare_greaterequal", "common/comparisons.cl"}, + {"compare_greaterequal_quantized", "common/comparisons.cl"}, + {"compare_less", "common/comparisons.cl"}, + {"compare_less_quantized", "common/comparisons.cl"}, + {"compare_lessequal", "common/comparisons.cl"}, + {"compare_lessequal_quantized", "common/comparisons.cl"}, + {"concatenate", "common/concatenate.cl"}, + {"concatenate_width", "common/concatenate.cl"}, + {"concatenate_height", "common/concatenate.cl"}, + {"concatenate_width_x2", "common/concatenate.cl"}, + {"concatenate_width_x4", "common/concatenate.cl"}, + {"col2im", "common/col2im.cl"}, + {"cast_down", "common/cast.cl"}, + {"cast_up", "common/cast.cl"}, + {"convert_fc_weights", "common/convert_fc_weights.cl"}, + {"copy_tensor", "common/copy_tensor.cl"}, + {"crop_tensor", "common/crop_tensor.cl"}, + {"deconvolution_reshape", "common/deconvolution_layer.cl"}, + {"deconvolution_upsample", "common/deconvolution_layer.cl"}, + {"dequantization_layer", "common/dequantization_layer.cl"}, + {"elementwise_operation_ADD", "common/elementwise_operation.cl"}, + {"elementwise_operation_SUB", "common/elementwise_operation.cl"}, + {"elementwise_operation_MAX", "common/elementwise_operation.cl"}, + {"elementwise_operation_MIN", "common/elementwise_operation.cl"}, + {"elementwise_operation_DIV", "common/elementwise_operation.cl"}, + {"elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl"}, + {"elementwise_operation_POWER", "common/elementwise_operation.cl"}, + {"elementwise_operation_PRELU", "common/elementwise_operation.cl"}, + {"elementwise_operation_AND", "common/elementwise_operation.cl"}, + {"elementwise_operation_OR", "common/elementwise_operation.cl"}, + {"elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_unary", "common/elementwise_unary.cl"}, + {"elementwise_unary_quantized", "common/elementwise_unary_quantized.cl"}, + {"fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl"}, + {"fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl"}, + {"fft_radix_2_first_stage_axis_0", "common/fft.cl"}, + 
{"fft_radix_2_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_2_axis_0", "common/fft.cl"}, + {"fft_radix_2_axis_1", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_3_axis_0", "common/fft.cl"}, + {"fft_radix_3_axis_1", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_4_axis_0", "common/fft.cl"}, + {"fft_radix_4_axis_1", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_5_axis_0", "common/fft.cl"}, + {"fft_radix_5_axis_1", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_7_axis_0", "common/fft.cl"}, + {"fft_radix_7_axis_1", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_8_axis_0", "common/fft.cl"}, + {"fft_radix_8_axis_1", "common/fft.cl"}, + {"fft_scale_conj", "common/fft_scale.cl"}, + {"fill_image_borders_constant", "common/fill_border.cl"}, + {"fill_image_borders_replicate", "common/fill_border.cl"}, + {"floor_layer", "common/floor.cl"}, + {"fuse_batchnormalization_layer", "common/batchnormalization_layer.cl"}, + {"gather", "common/gather.cl"}, + {"gemm_ma_f16", "common/gemm.cl"}, + {"gemm_ma_f32", "common/gemm.cl"}, + {"gemm_mv", "common/gemv.cl"}, + {"gemm_mv_quantized", "common/gemv.cl"}, + {"gemm_mm_native", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl"}, + {"gemm_lc_vm_f32", "common/gemm.cl"}, + {"gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl"}, + {"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_b_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_mm_native", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl"}, + {"gemmlowp_offset_contribution", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_float", 
"common/gemmlowp.cl"}, + {"generate_proposals_compute_all_anchors", "common/generate_proposals.cl"}, + {"generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl"}, + {"instance_normalization", "common/instance_normalization.cl"}, + {"compute_mean_var", "common/instance_normalization.cl"}, + {"l2_normalize_x", "common/l2_normalize.cl"}, + {"l2_normalize_y", "common/l2_normalize.cl"}, + {"l2_normalize_z", "common/l2_normalize.cl"}, + {"mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_nt_nt", "common/mat_mul.cl"}, + {"mat_mul_native_nt_t", "common/mat_mul.cl"}, + {"mat_mul_native_t_nt", "common/mat_mul.cl"}, + {"mat_mul_native_t_t", "common/mat_mul.cl"}, + {"mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl"}, + {"max_unpooling_layer_2", "common/unpooling_layer.cl"}, + {"mean_stddev_normalization", "common/mean_stddev_normalization.cl"}, + {"memset", "common/memset.cl"}, + {"minmax_layer", "common/minmax_layer.cl"}, + {"non_max_suppression", "common/nonmax.cl"}, + {"pad_layer_constant", "common/pad_layer.cl"}, + {"pad_layer_symmetric_reflect", "common/pad_layer.cl"}, + {"permute", "common/permute.cl"}, + {"pixelwise_mul_complex", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_float", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_int", "common/pixelwise_mul_int.cl"}, + {"pixelwise_mul_quantized", "common/pixelwise_mul_int.cl"}, + {"qlstm_layer_normalization", "common/qlstm_layer_normalization.cl"}, + {"quantization_layer", "common/quantization_layer.cl"}, + {"range", "common/range.cl"}, + {"range_quantized", "common/range.cl"}, + {"reduction_operation_x", "common/reduction_operation.cl"}, + {"reduction_operation_non_parallel_x", "common/reduction_operation.cl"}, + {"reduction_operation_y", "common/reduction_operation.cl"}, + {"reduction_operation_z", "common/reduction_operation.cl"}, + {"reduction_operation_w", "common/reduction_operation.cl"}, + {"reshape_layer", "common/reshape_layer.cl"}, + {"reshape_to_columns", "common/convolution_layer.cl"}, + {"reverse", "common/reverse.cl"}, + {"roi_align_layer", "common/roi_align_layer.cl"}, + {"roi_align_layer_quantized", "common/roi_align_layer_quantized.cl"}, + {"roi_pooling_layer", "common/roi_pooling_layer.cl"}, + {"select_same_rank", "common/select.cl"}, + {"select_different_rank_2", "common/select.cl"}, + {"select_different_rank_n", "common/select.cl"}, + {"softmax_layer_norm", "common/softmax_layer.cl"}, + {"softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl"}, + {"softmax_layer_max_shift_exp_sum_parallel", 
"common/softmax_layer.cl"}, + {"stack_layer", "common/stack_layer.cl"}, + {"strided_slice", "common/slice_ops.cl"}, + {"tile", "common/tile.cl"}, + {"transpose", "common/transpose.cl"}, #ifdef ENABLE_NCHW_KERNELS - { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, - { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, - { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, - { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, - { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, - { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution_nchw", "nchw/direct_convolution.cl" }, + {"batch_to_space_nchw", "nchw/batch_to_space.cl"}, + {"batch_to_space_static_nchw", "nchw/batch_to_space.cl"}, + {"batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl"}, + {"channel_shuffle_nchw", "nchw/channel_shuffle.cl"}, + {"depth_to_space_nchw", "nchw/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl"}, + {"direct_convolution1x1", "nchw/direct_convolution1x1.cl"}, + {"direct_convolution_nchw", "nchw/direct_convolution.cl"}, - { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, - { "im2col3x3_nchw", "nchw/im2col.cl" }, - { "im2col5x5_nchw", "nchw/im2col.cl" }, - { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "im2col_generic_nchw", "nchw/im2col.cl" }, - { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, - { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" }, - { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, - { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, - { "scale_nearest_neighbour_nchw", "nchw/scale.cl" }, - { "scale_bilinear_nchw", "nchw/scale.cl" }, - { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, - { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, - { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, - { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, - { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { 
"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, + {"im2col1x1_stridex1_nchw", "nchw/im2col.cl"}, + {"im2col3x3_nchw", "nchw/im2col.cl"}, + {"im2col5x5_nchw", "nchw/im2col.cl"}, + {"im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"im2col_generic_nchw", "nchw/im2col.cl"}, + {"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl"}, + {"normalization_layer_in_map_nchw", "nchw/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nchw", "nchw/pooling_layer.cl"}, + {"pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl"}, + {"prior_box_layer_nchw", "nchw/prior_box_layer.cl"}, + {"reorg_layer_nchw", "nchw/reorg_layer.cl"}, + {"scale_nearest_neighbour_nchw", "nchw/scale.cl"}, + {"scale_bilinear_nchw", "nchw/scale.cl"}, + {"space_to_batch_nchw", "nchw/space_to_batch.cl"}, + {"space_to_batch_static_nchw", "nchw/space_to_batch.cl"}, + {"space_to_depth_nchw", "nchw/space_to_depth.cl"}, + {"upsample_layer_nchw", "nchw/upsample_layer.cl"}, + {"winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + 
{"winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl"}, #endif /* ENABLE_NCHW_KERNELS */ #ifdef ENABLE_NHWC_KERNELS - { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, - { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, - { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, - { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, - { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, - { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, - { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, - { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, - { "direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl" }, - { "im2col3x3_nhwc", "nhwc/im2col.cl" }, - { "im2col9x9_nhwc", "nhwc/im2col.cl" }, - { "im2col_generic_nhwc", "nhwc/im2col.cl" }, - { "indirect_convolution_nhwc", "nhwc/indirect_convolution.cl" }, - { "indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl" }, - { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, - { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" }, - { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" }, - { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, - { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, - { "scale_bilinear_nhwc", "nhwc/scale.cl" }, - { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, - { 
"space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, - { "transposed_convolution_nhwc", "nhwc/transposed_convolution.cl" }, - { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, - { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, + {"batch_to_space_nhwc", "nhwc/batch_to_space.cl"}, + {"batch_to_space_static_nhwc", "nhwc/batch_to_space.cl"}, + {"batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl"}, + {"channel_shuffle_nhwc", "nhwc/channel_shuffle.cl"}, + {"depth_to_space_nhwc", "nhwc/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl"}, + {"dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl"}, + {"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl"}, + {"direct_convolution_nhwc", "nhwc/direct_convolution.cl"}, + {"direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl"}, + {"im2col3x3_nhwc", "nhwc/im2col.cl"}, + {"im2col9x9_nhwc", "nhwc/im2col.cl"}, + {"im2col_generic_nhwc", "nhwc/im2col.cl"}, + {"indirect_convolution_nhwc", "nhwc/indirect_convolution.cl"}, + {"indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl"}, + {"normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl"}, + {"normalization_layer_in_map_nhwc", 
"nhwc/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl"}, + {"pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl"}, + {"pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl"}, + {"reorg_layer_nhwc", "nhwc/reorg_layer.cl"}, + {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"}, + {"scale_bilinear_nhwc", "nhwc/scale.cl"}, + {"space_to_batch_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_batch_static_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_depth_nhwc", "nhwc/space_to_depth.cl"}, + {"transposed_convolution_nhwc", "nhwc/transposed_convolution.cl"}, + {"upsample_layer_nhwc", "nhwc/upsample_layer.cl"}, + {"winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl"}, #endif /* ENABLE_NHWC_KERNELS */ }; -const std::map ClKernelLibrary::_program_source_map = -{ +const std::map ClKernelLibrary::_program_source_map = { #ifdef EMBEDDED_KERNELS { "activation_float_helpers.h", @@ -996,7 +985,7 @@ std::string ClKernelLibrary::program_name(const std::string &kernel_name) const 
// Find which program contains the kernel auto kernel_program_it = _kernel_program_map.find(kernel_name); - if(_kernel_program_map.end() == kernel_program_it) + if (_kernel_program_map.end() == kernel_program_it) { ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); } @@ -1022,14 +1011,14 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr #ifdef EMBEDDED_KERNELS #ifdef ARM_COMPUTE_COMPRESSED_KERNELS const auto inflatted_program_source_it = _decompressed_source_map.find(program_name); - if(inflatted_program_source_it != _decompressed_source_map.end()) + if (inflatted_program_source_it != _decompressed_source_map.end()) { - return ClProgramInfo{ inflatted_program_source_it->second, false }; + return ClProgramInfo{inflatted_program_source_it->second, false}; } #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ const auto program_source_it = _program_source_map.find(program_name); - if(program_source_it == _program_source_map.end()) + if (program_source_it == _program_source_map.end()) { ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); } @@ -1042,7 +1031,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr program_source = std::move(decompressed_program_source); #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - return ClProgramInfo{ program_source, false }; + return ClProgramInfo{program_source, false}; #else /* EMBEDDED_KERNELS */ // Check for binary std::string source_name = _kernel_path + program_name; @@ -1050,12 +1039,12 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr std::string program_source{}; bool is_binary = false; - if(std::ifstream(binary_name).is_open()) + if (std::ifstream(binary_name).is_open()) { program_source = read_file(binary_name, true); is_binary = true; } - else if(std::ifstream(source_name).is_open()) + else if (std::ifstream(source_name).is_open()) { program_source = read_file(source_name, false); } @@ -1064,7 +1053,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str()); } - return ClProgramInfo{ program_source, is_binary }; + return ClProgramInfo{program_source, is_binary}; #endif /* EMBEDDED_KERNELS */ } } // namespace opencl diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h index 42bec95032..cd1d689199 100644 --- a/src/gpu/cl/ClKernelLibrary.h +++ b/src/gpu/cl/ClKernelLibrary.h @@ -52,8 +52,8 @@ public: /** Structure to encapsulte program related information */ struct ClProgramInfo { - std::string program{}; /**< Program raw string */ - bool is_binary{ false }; /**< Flag that indicates if is in binary format */ + std::string program{}; /**< Program raw string */ + bool is_binary{false}; /**< Flag that indicates if is in binary format */ }; public: @@ -84,10 +84,12 @@ public: std::string program_name(const std::string &kernel_name) const; private: - std::string _kernel_path{}; /**< Path to the kernels folder. */ - mutable std::map _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */ - static const std::map _kernel_program_map; /**< Map that associates kernel names with programs. */ - static const std::map _program_source_map; /**< Contains sources for all programs. + std::string _kernel_path{}; /**< Path to the kernels folder. 
*/ + mutable std::map + _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */ + static const std::map + _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map _program_source_map; /**< Contains sources for all programs. Used for compile-time kernel inclusion. >*/ }; } // namespace opencl diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp index 2123adcf39..0cb7af5b61 100644 --- a/src/gpu/cl/ClQueue.cpp +++ b/src/gpu/cl/ClQueue.cpp @@ -36,7 +36,7 @@ namespace { CLTunerMode map_tuner_mode(AclTuningMode mode) { - switch(mode) + switch (mode) { case AclRapid: return CLTunerMode::RAPID; @@ -55,7 +55,7 @@ CLTunerMode map_tuner_mode(AclTuningMode mode) std::unique_ptr populate_tuner(const AclQueueOptions *options) { - if(options == nullptr || options->mode == AclTuningModeNone) + if (options == nullptr || options->mode == AclTuningModeNone) { return nullptr; } @@ -68,8 +68,7 @@ std::unique_ptr populate_tuner(const AclQueueOptions *options) } } // namespace -ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) - : IQueue(ctx), _tuner(nullptr) +ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx), _tuner(nullptr) { _tuner = populate_tuner(options); } diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h index b16a0f4e83..09ffb06cf3 100644 --- a/src/gpu/cl/ClQueue.h +++ b/src/gpu/cl/ClQueue.h @@ -24,10 +24,10 @@ #ifndef SRC_GPU_CLQUEUE_H #define SRC_GPU_CLQUEUE_H -#include "src/common/IQueue.h" - #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/IQueue.h" + #include namespace arm_compute diff --git a/src/gpu/cl/ClTensor.cpp b/src/gpu/cl/ClTensor.cpp index 0df07813e3..27422a4130 100644 --- a/src/gpu/cl/ClTensor.cpp +++ b/src/gpu/cl/ClTensor.cpp @@ -31,8 +31,7 @@ namespace gpu { namespace opencl { -ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) - : ITensorV2(ctx), _legacy_tensor() +ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor() { ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::GpuOcl)); _legacy_tensor = std::make_unique(); @@ -43,7 +42,7 @@ void *ClTensor::map() { ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); - if(_legacy_tensor == nullptr) + if (_legacy_tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:map]: Backing tensor does not exist!"); return nullptr; @@ -57,7 +56,7 @@ StatusCode ClTensor::unmap() { ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); - if(_legacy_tensor == nullptr) + if (_legacy_tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:unmap]: Backing tensor does not exist!"); return StatusCode::RuntimeError; diff --git a/src/gpu/cl/ClTensor.h b/src/gpu/cl/ClTensor.h index 99d228c0b8..70184cd4bd 100644 --- a/src/gpu/cl/ClTensor.h +++ b/src/gpu/cl/ClTensor.h @@ -24,10 +24,10 @@ #ifndef SRC_GPU_CLTENSOR_H #define SRC_GPU_CLTENSOR_H -#include "src/common/ITensorV2.h" - #include "arm_compute/runtime/CL/CLTensor.h" +#include "src/common/ITensorV2.h" + namespace arm_compute { namespace gpu @@ -54,7 +54,7 @@ public: void *map() override; StatusCode unmap() override; arm_compute::ITensor *tensor() const override; - StatusCode import(void *handle, ImportMemoryType type) override; + StatusCode import(void *handle, ImportMemoryType type) override; private: std::unique_ptr _legacy_tensor; @@ -63,4 +63,4 @@ private: } // namespace gpu } // namespace arm_compute -#endif /* SRC_GPU_CLTENSOR_H */ \ No newline at end of file +#endif /* 
SRC_GPU_CLTENSOR_H */ diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h index 52ea3c9183..4f07e9ad68 100644 --- a/src/gpu/cl/IClKernel.h +++ b/src/gpu/cl/IClKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_ICL_KERNEL_H #include "arm_compute/core/ITensorInfo.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/gpu/cl/kernels/ClActivationKernel.cpp b/src/gpu/cl/kernels/ClActivationKernel.cpp index ab1543729f..a85296f7cd 100644 --- a/src/gpu/cl/kernels/ClActivationKernel.cpp +++ b/src/gpu/cl/kernels/ClActivationKernel.cpp @@ -28,14 +28,14 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" #include @@ -51,36 +51,47 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16, DataType::F16, DataType::F32); - static std::set quantized_supported_activations = - { - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, + static std::set quantized_supported_activations = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LEAKY_RELU, }; - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? 
dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), - "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && + (quantized_supported_activations.count(f_act) == 0), + "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and " + "lower/upper bounded relu are supported"); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); // Checks performed when destination is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -95,15 +106,18 @@ ClActivationKernel::ClActivationKernel() _type = CLKernelType::ELEMENTWISE; } -void ClActivationKernel::configure(const ClCompileContext &compile_context, 
ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info) +void ClActivationKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _run_in_place = (dst == nullptr) || (dst == src); - if(dst != nullptr) + if (dst != nullptr) { // Destination auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -119,11 +133,10 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); const bool is_quantized = is_data_type_quantized(dt); - const bool perform_activation_in_float = - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - || (f_act == ActivationLayerInfo::ActivationFunction::TANH) - || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); + const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) || + (f_act == ActivationLayerInfo::ActivationFunction::TANH) || + (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) || + (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); // Set build options CLBuildOptions build_opts; @@ -132,22 +145,23 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); std::string kernel_name = std::string("activation_layer"); // Set quantization info build options - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - if(!perform_activation_in_float) + if (!perform_activation_in_float) { int a_const_int = 0; int b_const_int = 0; // Create quantized version of constants a, b if needed - switch(dt) + switch (dt) { case DataType::QASYMM8: { @@ -180,22 +194,25 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen } // Quantized value of 0 corresponds to the offset o1 - build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); + build_opts.add_option( + ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); // Set correct kernel name kernel_name += perform_activation_in_float ? 
std::string("_quant_f32") : std::string("_quant"); // Set scale and offset of the source and destination if they have different quantization info - if(dst != nullptr) + if (dst != nullptr) { const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - if(iq_info != oq_info) + if (iq_info != oq_info) { build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); } } } @@ -235,8 +252,9 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -246,13 +264,12 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, dst, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h index 82e35b6104..ab7607bb82 100644 --- a/src/gpu/cl/kernels/ClActivationKernel.h +++ b/src/gpu/cl/kernels/ClActivationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -51,7 +52,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] act_info Activation layer information. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivationKernel::configure() @@ -64,7 +68,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - bool _run_in_place{ false }; + bool _run_in_place{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp index 3d8ecf1fcc..a853f6bc1b 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -66,12 +66,15 @@ ClBatchConcatenateKernel::ClBatchConcatenateKernel() _type = CLKernelType::ELEMENTWISE; } -void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int batch_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _batch_offset = batch_offset; @@ -81,8 +84,9 @@ void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -136,8 +140,9 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -152,9 +157,8 @@ void ClBatchConcatenateKernel::run_op(ITensorPack 
&tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h index f6b7c0ed09..549576b628 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClBatchConcatenateKernel::configure() @@ -66,7 +67,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - unsigned int _batch_offset{ 0 }; + unsigned int _batch_offset{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp index f621ad62d7..9ca35634f4 100644 --- a/src/gpu/cl/kernels/ClCastKernel.cpp +++ b/src/gpu/cl/kernels/ClCastKernel.cpp @@ -32,10 +32,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -52,20 +52,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -79,7 +76,10 @@ ClCastKernel::ClCastKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCastKernel::configure(const CLCompileContext 
&compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCastKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -88,7 +88,7 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Get data sizes const size_t src_size = data_size_from_type(src->data_type()); @@ -100,12 +100,14 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen // Set build options CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); - build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT"); + build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), + "-DIS_DATA_TYPE_FLOAT"); build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED"); // Create kernel @@ -148,8 +150,9 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -162,8 +165,7 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h index a021b3c78c..07b0b61443 100644 --- a/src/gpu/cl/kernels/ClCastKernel.h +++ b/src/gpu/cl/kernels/ClCastKernel.h @@ -64,7 +64,8 @@ public: * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. 
* @param[in] policy Conversion policy */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCastKernel::configure() diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp index 3316742912..9972e07f05 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.cpp +++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,29 +48,38 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, + "Col2Im output's data layout must always be NCHW"); } return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +std::pair +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW)); + auto_init_if_empty(*dst, src->clone() + ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)) + .set_data_layout(DataLayout::NCHW)); constexpr unsigned int num_elems_read_per_iteration = 8; @@ -80,18 +90,22 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration); bool window_changed = update_window_and_padding(win, input_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -ClCol2ImKernel::ClCol2ImKernel() - : _convolved_dims() +ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims() { _type = CLKernelType::ELEMENTWISE; } -void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +void ClCol2ImKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -132,11 +146,15 @@ void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorI _config_id += support::cpp11::to_string(dst->dimension(1)); } -Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status ClCol2ImKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); return Status{}; } @@ -168,8 +186,7 @@ void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h index e19b7c8e16..34194aba01 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.h +++ b/src/gpu/cl/kernels/ClCol2ImKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COL2IM_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -68,14 +69,19 @@ public: * @param[in] convolved_dims Output convolved dimensions. 
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClCol2ImKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp index 716dec1f30..85d3c3939c 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,17 +46,21 @@ ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel() _type = CLKernelType::ELEMENTWISE; } -void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*dst, *src->clone()); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; @@ -85,8 +90,10 @@ void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -96,7 +103,7 @@ Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, co ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -110,8 +117,9 @@ void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h index 16000e82f6..0ddb54561a 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h @@ -55,14 +55,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp index 4719448819..c80ef664f5 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.cpp +++ b/src/gpu/cl/kernels/ClCopyKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,11 +51,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); // Validate dst if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - if(dst_window == nullptr) + if (dst_window == nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); } @@ -74,12 +75,15 @@ ClCopyKernel::ClCopyKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window) +void ClCopyKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel CLBuildOptions build_opts; @@ -93,7 +97,7 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const Window win_config = calculate_max_window(*src, Steps(vec_size_x)); - if(dst_window != nullptr) + if (dst_window != nullptr) { _has_dst_window = true; _dst_window = Window(*dst_window); @@ -101,9 +105,11 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const int vec_size_x_leftover = width_x % vec_size_x; const bool multi_access_x = width_x >= static_cast(vec_size_x); - if(multi_access_x) + if (multi_access_x) { - _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); + _dst_window.set(Window::DimX, + Window::Dimension(dst_window->x().start(), + ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); } build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); @@ -127,7 +133,8 @@ void ClCopyKernel::configure(const 
CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) +Status +ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window)); @@ -139,12 +146,13 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice; - if(_has_dst_window) + if (_has_dst_window) { slice = window.first_slice_window_3D(); Window out_slice = _dst_window.first_slice_window_3D(); @@ -154,8 +162,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, out_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); + } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); } else { @@ -167,8 +174,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h index 63fd806586..f915bf672d 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.h +++ b/src/gpu/cl/kernels/ClCopyKernel.h @@ -47,7 +47,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. 
 */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Window *dst_window = nullptr);
 /** Static function to check if given info will lead to a valid configuration
 *
 * Similar to @ref ClCopyKernel::configure()
diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp
index 87ad6b49d9..0c503e13fc 100644
--- a/src/gpu/cl/kernels/ClCropKernel.cpp
+++ b/src/gpu/cl/kernels/ClCropKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -46,8 +47,14 @@ ClCropKernel::ClCropKernel()
 _type = CLKernelType::ELEMENTWISE;
 }
-void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index,
- float extrapolation_value, Window *dst_window)
+void ClCropKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
 {
 ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window));
@@ -60,7 +67,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen
 // Create and update the window (if needed)
 Window win = calculate_max_window(*dst);
- if(dst_window != nullptr)
+ if (dst_window != nullptr)
 {
 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window);
 win = *dst_window;
@@ -70,7 +77,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen
 const bool multi_access_x = dst_width_x >= vec_size_x;
 const bool remainder_x = dst_width_x % vec_size_x > 0;
- if(multi_access_x)
+ if (multi_access_x)
 {
 win.set(Window::DimX,
 Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
@@ -81,13 +88,21 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen
 CLBuildOptions build_opts;
 build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
 build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0)));
 build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
 build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
 _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options());
 }
-Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status ClCropKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
 {
 ARM_COMPUTE_UNUSED(extrapolation_value, dst_window);
 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
 @@
-95,14 +110,15 @@ Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Co ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0); - ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast(src->dimension(1)) || start.y >= static_cast(src->dimension(2)) - || end.x >= static_cast(src->dimension(1)) || end.y >= static_cast(src->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON( + start.x >= static_cast(src->dimension(1)) || start.y >= static_cast(src->dimension(2)) || + end.x >= static_cast(src->dimension(1)) || end.y >= static_cast(src->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3)); - if(dst_window != nullptr) + if (dst_window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1); } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -116,12 +132,15 @@ void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window in_slice = Window(); in_slice.use_tensor_dimensions(src->info()->tensor_shape()); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step())); + in_slice.set(Window::DimX, + Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), + window.x().step())); in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1)); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h index 2f166e184c..506262608c 100644 --- a/src/gpu/cl/kernels/ClCropKernel.h +++ b/src/gpu/cl/kernels/ClCropKernel.h @@ -53,16 +53,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCropKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp index a05cd1321e..ec44d88f01 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -48,7 +48,8 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); @@ -60,18 +61,20 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con } } // namespace -ClDepthConcatenateKernel::ClDepthConcatenateKernel() - : _depth_offset(0) +ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0) { _type = CLKernelType::ELEMENTWISE; } -void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int depth_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _depth_offset = depth_offset; @@ -81,8 +84,9 @@ void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - 
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -122,8 +126,9 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -138,8 +143,7 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h index 4739677f3b..539f010303 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. 
 *
 */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
 /** Static function to check if given info will lead to a valid configuration
 *
 * Similar to @ref ClDepthConcatenateKernel::configure()
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
index 756cd56a8b..53429ab1aa 100644
--- a/src/gpu/cl/kernels/ClDequantizeKernel.cpp
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
@@ -34,7 +34,6 @@
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
@@ -49,9 +48,11 @@ namespace
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 {
 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+ DataType::QSYMM16);
- if(dst->tensor_shape().total_size() > 0)
+ if (dst->tensor_shape().total_size() > 0)
 {
 ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
@@ -74,7 +75,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen
 // Output tensor auto initialization if not yet initialized
 auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
@@ -87,7 +88,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen
 // Create kernel
 CLBuildOptions build_opts;
- if(!is_quantized_per_channel)
+ if (!is_quantized_per_channel)
 {
 const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
 const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ?
qinfo.offset : 0; @@ -103,16 +104,18 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max(output_width_x - vec_size_x, 0))); // Create kernel name _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -136,10 +139,11 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type()); // Collapse windo - Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3); + Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) + : window.collapse_if_possible(ICLKernel::window(), 3); Window slice = new_window.first_slice_window_3D(); - if(is_quantized_per_channel) + if (is_quantized_per_channel) { unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters _kernel.setArg(idx++, src->quantization().scale->cl_buffer()); @@ -151,8 +155,7 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(new_window.slide_window_slice_3D(slice)); + } while (new_window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 7ad398412a..7cf1958c1b 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -23,17 +23,18 @@ */ #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" @@ -51,11 +52,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const 
PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); const DataLayout data_layout = src->data_layout(); @@ -63,41 +70,56 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, + "Export to CLImage is not supported for the input tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, + "Export to CLImage is not supported for the output tensor"); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), + "Weights should have same width and height"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, + "Strides larger than 3 not supported for 1x1 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || + weights->dimension(width_idx) == 9) && + std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout"); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && 
weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); } } - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), "Fused activation in NHWC is only supported for floating point."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), + "Fused activation in NHWC is only supported for floating point."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); @@ -106,9 +128,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -118,20 +140,19 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -140,7 +161,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; } @@ -151,8 +173,14 @@ ClDirectConv2dKernel::ClDirectConv2dKernel() _type = CLKernelType::DIRECT; } -void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -178,14 +206,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { output_shape.collapse(2U, 1U); const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]); @@ -194,7 +219,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT // Create window and update padding win = calculate_max_window(output_shape, Steps(n0, m0)); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { _num_elems_processed_per_iteration = 1u; win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -205,7 +230,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT std::stringstream kernel_name; CLBuildOptions build_options; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { kernel_name << "direct_convolution_nhwc"; @@ -221,22 +246,22 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _export_output_to_cl_image = desc.export_output_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { gemm::update_padding_for_cl_image(weights); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { gemm::update_padding_for_cl_image(dst); } - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { 
gemm::update_padding_for_cl_image(src); } - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -246,9 +271,10 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT const auto act_function = act_info.activation(); const auto dst_data_type = dst->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -259,7 +285,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-cl-fast-relaxed-math"); } - build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", + "-DSRC_TENSOR_TYPE=BUFFER"); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0))); build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1))); @@ -267,9 +294,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0))); build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1))); build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2))); - build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", + "-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); - build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", + "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); @@ -284,7 +313,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP"); build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function))); - if(is_data_type_quantized(data_type)) + if 
(is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -314,11 +343,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + build_options.add_option_if(act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_options.add_option_if(act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -340,13 +371,17 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option( + std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); - build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -405,8 +440,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, 
desc)); return Status{}; @@ -420,52 +460,55 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { cl::Image2D weights_cl_image; cl::Image2D output_cl_image; cl::Image2D input_cl_image; - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { // Export tensor to cl_image weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { // Export tensor to cl_image output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly); } - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { // Export tensor to cl_image input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly); } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, src); - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { _kernel.setArg(idx++, output_cl_image); } add_4d_tensor_nhwc_argument(idx, dst); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } @@ -476,7 +519,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); add_3D_tensor_argument(idx1, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { Window slice_biases; slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); @@ -491,8 +534,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h index 7132762b35..c934c825ca 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -68,16 +69,27 @@ public: * @param[in] act_info Contains activaton information described in @ref 
ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClDirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -85,9 +97,9 @@ public: public: DataLayout _data_layout{}; PadStrideInfo _conv_info{}; - bool _export_weights_to_cl_image{ false }; - bool _export_output_to_cl_image{ false }; - bool _export_input_to_cl_image{ false }; + bool _export_weights_to_cl_image{false}; + bool _export_output_to_cl_image{false}; + bool _export_input_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp index 6191178911..8002520a87 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -40,7 +41,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); @@ -49,20 +54,25 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported"); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation 
!= Size3D(1U, 1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > + (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > + (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > + (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); - if(src2 != nullptr) + if (src2 != nullptr) { - if(is_data_type_quantized(src0->data_type())) + if (is_data_type_quantized(src0->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); } @@ -70,15 +80,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -91,8 +104,12 @@ ClDirectConv3dKernel::ClDirectConv3dKernel() _type = CLKernelType::DIRECT; } -void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, - const Conv3dInfo &conv3d_info) +void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -149,13 +166,13 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - if(src2 != nullptr) + if (src2 != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); 
build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type()))); } - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform(); @@ -218,7 +235,11 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co _config_id += support::cpp11::to_string(dst_channels); } -Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info)); return Status{}; @@ -229,21 +250,28 @@ void ClDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); // Get initial windows Window slice = window.first_slice_window_3D(); - slice.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3), slice.y().step()), slice.y().step())); + slice.set(Window::DimY, Window::Dimension(0, + ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * + dst->info()->dimension(3), + slice.y().step()), + slice.y().step())); slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h index de4f0ce216..cb7509d8fa 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h @@ -70,14 +70,23 @@ public: * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts. * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. 
 */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
 /** Static function to check if given info will lead to a valid configuration
 *
 * Similar to ClDirectConv3dKernel::configure()
 *
 * @return a status
 */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
 // Inherited methods overridden:
 void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
index 6beee576b5..cdb3527a92 100644
--- a/src/gpu/cl/kernels/ClElementwiseKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -23,18 +23,20 @@
 */
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/common/utils/Validate.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
 #include "support/StringSupport.h"
+
 #include <map>
 namespace arm_compute
@@ -47,25 +49,20 @@ namespace
 {
 constexpr unsigned int vector_size_byte_opencl = 16;
-std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
- { ArithmeticOperation::DIV, "DIV" },
- { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
- { ArithmeticOperation::MIN, "MIN" },
- { ArithmeticOperation::MAX, "MAX" },
- { ArithmeticOperation::POWER, "POWER" },
- { ArithmeticOperation::PRELU, "PRELU" },
+std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = {
+ {ArithmeticOperation::ADD, "ADD"}, {ArithmeticOperation::SUB, "SUB"},
+ {ArithmeticOperation::DIV, "DIV"}, {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"},
+ {ArithmeticOperation::MIN, "MIN"}, {ArithmeticOperation::MAX, "MAX"},
+ {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"},
 };
-std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
+std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = {
+ {ArithmeticOperation::ADD, "ADD"},
+ {ArithmeticOperation::SUB, "SUB"},
 };
-std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+std::string
+generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
 {
 std::string config_id;
 // Set config_id for enabling LWS tuning
@@ -79,12 +76,18 @@ std::string generate_id_for_tuning_common(const std::string &kernel_name, const
 return config_id;
 }
-Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst,
const TensorShape &out_shape) +Status validate_in_place_output_shape(const bool in_place, + const bool src1_in_place, + const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const TensorShape &out_shape) { - if(in_place) + if (in_place) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); } else { @@ -94,7 +97,9 @@ Status validate_in_place_output_shape(const bool in_place, const bool src1_in_pl return Status{}; } -Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); @@ -110,11 +115,12 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); } return Status{}; @@ -136,25 +142,27 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); } return Status{}; } -Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status +validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - if(is_data_type_quantized_symmetric(src1.data_type())) + if (is_data_type_quantized_symmetric(src1.data_type())) { const int32_t in1_offset = 
src1.quantization_info().uniform().offset; const int32_t in2_offset = src2.quantization_info().uniform().offset; @@ -170,13 +178,15 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - if(is_data_type_quantized_symmetric(dst.data_type())) + if (is_data_type_quantized_symmetric(dst.data_type())) { const int32_t offset = dst.quantization_info().uniform().offset; ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero"); @@ -185,19 +195,26 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I return Status{}; } -CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string) +CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const std::string &operation_string) { CLBuildOptions build_opts; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + + support::cpp11::to_string(src2.dimension(0) == 1 ? 
1 : num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DOP=" + operation_string); - if(is_data_type_quantized(src1.data_type())) + if (is_data_type_quantized(src1.data_type())) { const UniformQuantizationInfo iq1info = src1.quantization_info().uniform(); const UniformQuantizationInfo iq2info = src2.quantization_info().uniform(); @@ -223,14 +240,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &s std::pair configure_window_arithmetic_common(ITensorInfo &dst) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -std::pair validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -238,9 +258,11 @@ std::pair validate_and_configure_window_for_arithmetic_operators return configure_window_arithmetic_common(dst); } -std::pair validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; set_shape_if_empty(dst, out_shape); @@ -249,9 +271,11 @@ std::pair validate_and_configure_window_for_logical_binary_opera return configure_window_arithmetic_common(dst); } -std::pair validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -265,21 +289,24 @@ ClElementwiseKernel::ClElementwiseKernel() _type = CLKernelType::ELEMENTWISE; } -void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClElementwiseKernel::configure_common(const ClCompileContext 
&compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { // Configure kernel window auto win_config = validate_and_configure_window(*src1, *src2, *dst); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); std::string kernel_name = "elementwise_operation_" + name(); - if(is_data_type_quantized(src1->data_type())) + if (is_data_type_quantized(src1->data_type())) { kernel_name += "_quantized"; } // Set kernel build options CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst); - if(_act_info.enabled()) + if (_act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a())); @@ -299,9 +326,11 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); @@ -311,17 +340,18 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -337,7 +367,7 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_src1); add_3D_tensor_argument(idx, src_1, slice_src2); - if(!in_place) + if (!in_place) { add_3D_tensor_argument(idx, dst, slice); } @@ -345,13 +375,16 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c enqueue(queue, *this, slice, lws_hint()); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } /** Logical binary */ -void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst)); @@ -359,7 +392,10 @@ void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, L configure_common(compile_context, src1, src2, dst); } -Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status ClLogicalBinaryKernel::validate(LogicalOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not); @@ -369,14 +405,16 @@ Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *s ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); return Status{}; } std::string ClLogicalBinaryKernel::name() { - switch(_op) + switch (_op) { case LogicalOperation::And: return "AND"; @@ -390,30 +428,38 @@ std::string ClLogicalBinaryKernel::name() return ""; } -std::pair ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst); } -CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +CLBuildOptions +ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { // The arithmetic utility functions can be share return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const ITensorInfo &dst) { return 
generate_id_for_tuning_common(kernel_name, src1, dst); } /** Arithmetic operations with saturation*/ -void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, +void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info)); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); _policy = policy; _op = op; @@ -422,24 +468,34 @@ void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_cont ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, +Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(op, policy); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()) + .first); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type())); return Status{}; } -std::pair ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +std::pair ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, + ITensorInfo &input2, + ITensorInfo &output) { return validate_and_configure_window_for_arithmetic_operators(input1, input2, output); } -CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) { const bool has_float_out = is_data_type_float(output.data_type()); auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name()); @@ -447,7 +503,9 @@ CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensor return build_options; } -std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) +std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) { auto config_id = generate_id_for_tuning_common(kernel_name, input1, output); config_id += (_policy == ConvertPolicy::WRAP) ? 
"_wrap_" : "_saturate_"; @@ -461,12 +519,16 @@ std::string ClSaturatedArithmeticKernel::name() } /** Arithmetic operations*/ -void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, +void ClArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); _op = op; _act_info = act_info; @@ -474,33 +536,42 @@ void ClArithmeticKernel::configure(const ClCompileContext &compile_context, Arit ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - if(op == ArithmeticOperation::DIV) + if (op == ArithmeticOperation::DIV) { // Partial integer support S32/F32/F16 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } - else if(op == ArithmeticOperation::POWER) + else if (op == ArithmeticOperation::POWER) { // Power operators doesn't support integer arithmetic ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } else { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); } ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); return Status{}; } -std::pair ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair +ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) + if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) { // Division and Power operators don't support integer arithmetic return validate_and_configure_window_for_division(src1, src2, dst); @@ -511,11 +582,14 @@ std::pair ClArithmeticKernel::validate_and_configure_window(ITen } } -CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) 
+CLBuildOptions +ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const ITensorInfo &dst) { return generate_id_for_tuning_common(kernel_name, src1, dst); } diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h index ea3ddb2124..73e54542b2 100644 --- a/src/gpu/cl/kernels/ClElementwiseKernel.h +++ b/src/gpu/cl/kernels/ClElementwiseKernel.h @@ -25,8 +25,9 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/core/KernelTypes.h" + #include "src/core/common/Macros.h" +#include "src/core/KernelTypes.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -65,24 +66,28 @@ protected: * * @return a pair of Status and Window */ - virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; + virtual std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; /** Generate the build options for the specific kernel * * @return a CLBuildOptions struct */ - virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; + virtual CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; /** Generate the identifier for tuning * * @return a string */ - virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; + virtual std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void + configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); ActivationLayerInfo _act_info{}; }; @@ -100,23 +105,31 @@ public: * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
*/ - void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogicalBinaryKernel::configure() * * @return a status */ - static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + static Status + validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); private: // Inherited methods overridden: std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - - LogicalOperation _op{ LogicalOperation::Unknown }; + std::pair + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + + LogicalOperation _op{LogicalOperation::Unknown}; }; /** Addition operation */ @@ -135,7 +148,12 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -144,15 +162,23 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, + static Status validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; - CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override; + std::pair + validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; + CLBuildOptions + generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; + std::string generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) override; private: ConvertPolicy _policy{}; @@ -174,7 +200,11 @@ public: * @param[in] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -183,14 +213,21 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + std::pair + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; private: ArithmeticOperation _op{}; diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp index 744a3a40c7..f7c198ee54 100644 --- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp @@ -23,11 +23,12 @@ */ #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -46,17 +47,18 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - if(op == ElementWiseUnary::LOGICAL_NOT) + if (op == ElementWiseUnary::LOGICAL_NOT) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8); } - else if(op == ElementWiseUnary::NEG) + else if (op == ElementWiseUnary::NEG) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); } - else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. + else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. 
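Note: when RSQRT runs on quantized data (the branch above, together with the SCALE_IN / SCALE_OUT and related offset build options set further below), the kernel can be thought of as dequantising with real = scale * (q - offset), applying 1/sqrt(x), and requantising the result. A rough standalone sketch under that reading, with made-up scale and offset values (not code from this patch or from the library's kernel source):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative only: RSQRT on asymmetrically quantized 8-bit data, following
// the usual real = scale * (q - offset) convention. The scale/offset numbers
// below are invented for the example.
int main()
{
    const float scale_in   = 0.05f;
    const float scale_out  = 0.1f;
    const int   offset_in  = 10;
    const int   offset_out = 5;

    const std::uint8_t q_in = 90;                                     // quantized input
    const float x = scale_in * (static_cast<int>(q_in) - offset_in);  // dequantize -> 4.0
    const float y = 1.0f / std::sqrt(x);                              // rsqrt      -> 0.5
    const long  q = std::lround(y / scale_out) + offset_out;          // requantize
    const std::uint8_t q_out = static_cast<std::uint8_t>(std::min<long>(std::max<long>(q, 0), 255));

    std::printf("q_in=%u  x=%.3f  rsqrt=%.3f  q_out=%u\n",
                static_cast<unsigned>(q_in), x, y, static_cast<unsigned>(q_out));
    return 0;
}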
{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); } else { @@ -64,7 +66,7 @@ Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const } // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); @@ -80,19 +82,23 @@ ClElementWiseUnaryKernel::ClElementWiseUnaryKernel() _type = CLKernelType::ELEMENTWISE; } -void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op) +void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const ElementWiseUnary &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0)); - std::string kernel_name = "elementwise_unary"; - const int vec_size_x = num_elems_processed_per_iteration; - const int dst_width_x = dst->dimension(0); - if(is_data_type_quantized(src->data_type())) + std::string kernel_name = "elementwise_unary"; + const int vec_size_x = num_elems_processed_per_iteration; + const int dst_width_x = dst->dimension(0); + if (is_data_type_quantized(src->data_type())) { kernel_name += "_quantized"; } @@ -101,7 +107,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); @@ -110,7 +116,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale)); build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); } - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: build_opts.add_option("-DOPERATION=rsqrt_op"); @@ -169,8 +175,9 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = 
utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); do { @@ -178,8 +185,7 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h index 0f270f25e8..81721f8ca8 100644 --- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h @@ -47,7 +47,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] op Element wise unary operation to perform. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const ElementWiseUnary &op); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementWiseUnaryKernel::configure() diff --git a/src/gpu/cl/kernels/ClFillKernel.cpp b/src/gpu/cl/kernels/ClFillKernel.cpp index a9345ee334..96ad503730 100644 --- a/src/gpu/cl/kernels/ClFillKernel.cpp +++ b/src/gpu/cl/kernels/ClFillKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,9 +48,10 @@ ClFillKernel::ClFillKernel() _type = CLKernelType::ELEMENTWISE; } -void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, - const PixelValue &constant_value, - Window *window) +void ClFillKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window)); @@ -60,7 +62,7 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf // Create and update the window (if needed) _full_window = calculate_max_window(*tensor); Window win = _full_window; - if(window != nullptr) + if (window != nullptr) { ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window); win = *window; @@ -70,9 +72,10 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf const bool multi_access_x = output_width_x >= vec_size_x; const bool remainder_x = output_width_x % vec_size_x > 0; - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -81,7 +84,9 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type)); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, 
"-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); _kernel = create_kernel(compile_context, "memset", build_opts.options()); } @@ -89,7 +94,7 @@ Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &const { ARM_COMPUTE_UNUSED(tensor); ARM_COMPUTE_UNUSED(constant_value); - if(window != nullptr) + if (window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); } @@ -101,7 +106,8 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto tensor = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); // Collapse all the batches on the third Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); @@ -112,8 +118,7 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h index f25cf928ad..5d69fbfbd1 100644 --- a/src/gpu/cl/kernels/ClFillKernel.h +++ b/src/gpu/cl/kernels/ClFillKernel.h @@ -47,7 +47,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClFillKernel::configure() diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp index f9f834875a..358e84012b 100644 --- a/src/gpu/cl/kernels/ClFloorKernel.cpp +++ b/src/gpu/cl/kernels/ClFloorKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -52,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); // Validate in case of configured output - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -76,9 +77,9 @@ void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITe // Validate ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); @@ -105,8 +106,9 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -117,8 +119,7 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp index accafeecc2..e0d925dfb2 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include 
"arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,26 +49,35 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m(); @@ -83,7 +91,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast(n)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); } @@ -92,9 +100,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); } - if(dst->total_size() != 0) + if 
(dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -102,8 +111,13 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -115,17 +129,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_dst_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_dst_as_3d) + if (reinterpret_input_as_3d == reinterpret_dst_as_3d) { reinterpret_dst_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + auto_init_if_empty(*dst, src0->clone() + ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)) + .set_data_type(DataType::S32)); TensorInfo tmp_info(*dst); - if(reinterpret_dst_as_3d) + if (reinterpret_dst_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -138,12 +154,12 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // RHS matrix still needs padding on the X - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop @@ -153,7 +169,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -163,8 +180,13 @@ ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -175,11 +197,11 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -192,7 +214,8 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = + validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -212,8 +235,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); @@ -258,19 +283,19 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo 
&lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -280,11 +305,13 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -296,7 +323,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -304,10 +331,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } @@ -317,7 +344,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -330,8 +357,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h index 4b328e0ab8..4f87096158 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -55,25 +56,34 @@ public: * rhs_info.k0: same as lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp index 15493f7ddc..ddbc809cdd 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp +++ 
b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -51,45 +50,55 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m(); const int n = gemm_info.n(); const int k = gemm_info.k(); - TensorShape tensor_shape0{ src0->tensor_shape() }; + TensorShape tensor_shape0{src0->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped0 = + 
src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); @@ -99,19 +108,24 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info, - ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -123,7 +137,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // Configure kernel window num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -140,8 +155,13 @@ ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel() _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); @@ -154,11 +174,12 @@ void 
ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c const unsigned int num_dimensionssrc0 = src0->num_dimensions(); _slide_matrix_b = (src1->num_dimensions() >= num_dimensionssrc0); - auto padding_info = get_padding_info({ src0, src1, dst }); + auto padding_info = get_padding_info({src0, src1, dst}); ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = + validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -171,8 +192,10 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c // Create build options CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); @@ -230,19 +253,19 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -252,11 +275,13 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -268,7 +293,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; @@ -281,7 +306,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -295,8 +320,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h index a16f500f11..d7b785996f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -64,25 +65,34 @@ public: * * @note lhs_info.k0 must be equal to rhs_info.k0 */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - 
bool _reinterpret_output_as_3d{ false }; - unsigned int _k{ 1 }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + unsigned int _k{1}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp index 5d552b8d63..2f1f3b8df0 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -54,45 +53,57 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); } @@ -103,11 +114,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -117,39 +128,42 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool 
reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -161,30 +175,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -194,9 +210,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo 
output_stage = gemm_info.output_stage; @@ -211,16 +234,17 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -229,7 +253,7 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -242,12 +266,14 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); @@ -255,17 +281,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); } } @@ -275,7 +303,8 @@ 
std::pair validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -285,15 +314,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedO _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -307,7 +343,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -320,7 +356,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -341,8 +378,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); @@ -361,12 +400,12 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { kernel_name += "_fused_output_stage_fixedpoint"; _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -377,9 +416,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); // In case of _is_quantized_per_channel, RESULT_MULTIPLIER and RESULT_SHIFT are not utilized, but they are passed as a part of T_QUANTIZE8 macro. 
- if(!_is_quantized_per_channel) + if (!_is_quantized_per_channel) { - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_MULTIPLIER=" + + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); } else @@ -432,42 +472,56 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -479,7 +533,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -487,10 +541,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } @@ -515,7 +569,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -527,19 +581,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor idx++; } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor idx++; } - if(_fuse_output_stage) + if (_fuse_output_stage) { add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); @@ -548,8 +602,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); } enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h index a77604db7c..1d4696b089 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -70,31 +71,44 @@ public: * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _is_quantized_per_channel{ false }; - bool _fuse_output_stage{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _is_quantized_per_channel{false}; + bool _fuse_output_stage{false}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 792c71da76..030c11d069 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,15 @@ */ #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" namespace arm_compute @@ -47,39 +46,51 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const 
ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), "Only 1,2,4 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), "Only 1,4,8 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), + "Only 1,2,4 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), + "Only 1,4,8 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); } @@ -90,11 +101,11 @@ Status 
validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -104,38 +115,41 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -147,30 +161,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -180,9 +196,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -200,9 +223,10 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, reinterpret_output_as_3d = false; // dst tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -211,7 +235,7 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - 
if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -224,11 +248,12 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = 1; num_elems_processed_per_iteration_y = 1; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access); @@ -236,17 +261,19 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); } } @@ -278,7 +305,8 @@ std::pair validate_and_configure_window(const ITensorInfo *src0, collapsed.set(Window::DimX, x_dimension); collapsed.set(Window::DimY, y_dimension); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -288,15 +316,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyResha _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -313,7 +348,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -334,18 +370,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul"); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT"); _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && 
vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -396,42 +433,54 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -449,7 +498,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(src2 != nullptr) + if (src2 != nullptr) { add_3d_tensor_nhw_argument(idx, src2); } @@ -461,21 +510,20 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens _kernel.setArg(idx++, _n); _kernel.setArg(idx++, _k); - if(_fuse_output_stage) + if (_fuse_output_stage) { - if(vector_sum_col != nullptr) + if (vector_sum_col != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_col); } - if(vector_sum_row != nullptr) + if (vector_sum_row != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_row); } } enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 0ae549cd53..fc8b73140d 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/IClKernel.h" @@ -65,29 +66,42 @@ public: * @param[in]
output_multipliers (Optional) Output multipliers tensor. Supported data types: S32. * @param[in] output_shifts (Optional) Output shifts tensor. Supported data types: S32. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _fuse_output_stage{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _fuse_output_stage{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp index 9ec0b5182f..d93dbde95a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp @@ -28,11 +28,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -44,12 +43,16 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -57,26 +60,28 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -87,13 +92,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } @@ -108,29 +115,34 @@ ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() } void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset) + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); + auto padding_info = get_padding_info({mm_result, 
vector_sum_col, vector_sum_row, bias}); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -138,8 +150,10 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); std::string kernel_name("gemmlowp_offset_contribution"); @@ -165,10 +179,15 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); return Status{}; } @@ -177,10 +196,13 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto bias = 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_SRC_DST)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_SRC_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -209,8 +231,7 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h index 48926e280b..2080a3a091 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h @@ -67,15 +67,25 @@ public: * @param[in] b_offset Offset to be added to each element of the matrix B. */ void configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset); + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp index c5fb54f524..26f479f61a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp @@ -34,7 +34,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,12 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status 
validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -62,33 +69,35 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -99,20 +108,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); @@ -120,7 +131,8 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), + "per channel quantization info is incorrect"); return Status{}; } @@ -131,16 +143,26 @@ ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutpu _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); + auto padding_info = + get_padding_info({mm_result, vector_sum_col, 
vector_sum_row, bias, dst, output_multipliers, output_shifts}); const int min = output_stage.gemmlowp_min_bound; const int max = output_stage.gemmlowp_max_bound; @@ -148,9 +170,8 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon _is_quantized_per_channel = output_stage.is_quantized_per_channel; // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Auto initialize the output auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); @@ -160,10 +181,11 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -171,8 +193,10 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); @@ -210,26 +234,42 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, 
+ const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); return Status{}; } -void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto mm_result = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto mm_result = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -260,8 +300,7 @@ void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h index cee04473c4..97ee9bc97f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h @@ -66,23 +66,40 @@ public: * @param[in] output_shifts Output shifts tensor.
In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32 */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _is_quantized_per_channel{ false }; + bool _is_quantized_per_channel{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp index 39754385a1..7b7beab12c 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,20 +46,23 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -75,7 +77,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32S _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -84,14 +88,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITenso return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -103,19 +110,26 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi auto max = info->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if( + (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if( + (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMAX_BOUND=" + 
support::cpp11::to_string(max)); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) + ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" + : "gemmlowp_output_stage_quantize_down_fixedpoint"; // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -129,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Create src window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -144,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -158,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h index 69b5fc5018..71c9f4b752 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -60,14 +60,21 @@ public: * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. * @param[in] info Output stage info.
Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp index f379698326..52ebd32d46 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,23 +46,31 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && + (info->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) || + info->gemmlowp_min_bound > info->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); 
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -78,7 +85,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleB _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -87,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -107,7 +119,8 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); @@ -130,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); // Create input window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -145,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -159,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h index 8eda24d25f..057c66767f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -62,14 +62,21 @@ public: * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info Output stage info. Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp index 5d54db214a..31434ce61b 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,25 +45,34 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const 
ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && + (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, + "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -77,7 +85,10 @@ ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); @@ -85,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); @@ -104,13 +118,18 @@ void 
ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c auto max = output_stage->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type( + output_stage->output_data_type))) && + (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type( + output_stage->output_data_type))) && + (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); @@ -135,15 +154,17 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,8 +178,7 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h index 84c5060362..e6390801f1 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h @@ -62,14 +62,21 @@ public: * @param[out] 
dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] output_stage GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp index ea88b485a0..ee4a191fed 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp @@ -32,7 +32,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,12 +46,15 @@ namespace Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(1), + "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } @@ -60,12 +62,15 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(0), + "Output vector must have length equal to the number of columns of the input matrix"); } return Status{}; } @@ -76,7 +81,10 @@ IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); @@ -85,7 +93,7 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); + auto padding_info = get_padding_info({mtx_a, vector_sum_row}); // Set the arguments to pass at compile time CLBuildOptions build_opts; @@ -120,7 +128,9 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, + const ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); @@ -133,8 +143,9 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); Window slice_in = collapsed.first_slice_window_2D(); @@ -151,11 +162,13 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } -void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -163,14 +176,15 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet 
initialized auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); + auto padding_info = get_padding_info({mtx_b, vector_sum_col}); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); @@ -192,7 +206,9 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, + const ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -205,8 +221,9 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -222,8 +239,7 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h index 7119b5fee0..c81543e4c2 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,10 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. 
*/ - virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; + virtual void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const GEMMLowpReductionKernelInfo &info) = 0; }; /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. @@ -74,14 +78,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -106,14 +114,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp index b8997dfc7f..fd23aa9924 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/helpers/AutoConfiguration.h" @@ -51,7 +52,13 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const 
ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -59,15 +66,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native"); @@ -82,7 +94,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -91,15 +103,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if 
(gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -107,9 +120,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -117,9 +131,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -132,17 +151,18 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -155,34 +175,34 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic src0_access(src0, 0, 0, - src0->dimension(0), - src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); + AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1)); - if(src2 != nullptr) + if (src2 != nullptr) { const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), src2->dimension(1)); - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = update_window_and_padding(win, src0_access, src1_access, + src2_access) || // window used by the execute_window_loop + update_window_and_padding( + win_out, dst_access); // window used to update the padding requirements of dst tensor } else { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = + update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, + dst_access); // window used to update the padding requirements of dst tensor } // Collapse along the Z direction @@ -191,7 +211,8 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens const unsigned 
int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -201,19 +222,26 @@ ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); @@ -221,7 +249,7 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -234,7 +262,8 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? 
src2 : nullptr, dst, lhs_info, + rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -260,14 +289,17 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); @@ -275,9 +307,13 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_native"); @@ -314,21 +350,23 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float 
alpha, float beta, +Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -338,15 +376,18 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -358,11 +399,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { idx0 = 4 * num_arguments_per_2D_tensor() + 7; } @@ -374,11 +415,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 
1 : 0); } @@ -395,7 +436,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -403,7 +444,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_2D_tensor_argument(idx, src0, slice); add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) + if (_add_bias) { add_2D_tensor_argument(idx, src2, slice); } @@ -411,7 +452,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - if(_add_bias) + if (_add_bias) { _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); } @@ -423,8 +464,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h index 80f8355932..da6c9a5bb7 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -58,7 +59,13 @@ public: * rhs_info.k0: same of lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -68,7 +75,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -76,14 +89,14 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + 
bool _add_bias{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index d72d29ea1e..4fe6bddb36 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,7 +53,13 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -61,42 +68,50 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), + "Only 2,3,4,8,16 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), + "Mixed precision only supported for F16 data type"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape0{ src0->tensor_shape() }; + TensorShape tensor_shape0{src0->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -107,15 +122,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped0 = + src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -123,9 +141,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair 
validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -134,7 +157,7 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -147,7 +170,8 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -164,18 +188,26 @@ ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; @@ -188,14 +220,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - (src2 != nullptr) ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed); + auto win_config = validate_and_configure_window( + src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -213,12 +240,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi // Create build options CLBuildOptions build_opts; - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); @@ -229,7 +259,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); + build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision + ? 
get_cl_type_from_data_type(DataType::F32) + : get_cl_type_from_data_type(data_type))); build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); @@ -237,9 +269,13 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_"); kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; @@ -287,9 +323,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; @@ -300,15 +342,18 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = 
utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -324,12 +369,14 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -337,7 +384,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -348,7 +395,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -370,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); } @@ -379,7 +426,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); // Cross-plan padding (if _reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad)); } @@ -393,8 +440,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h index 8d25412a40..30928c4e1d 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H +#include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include 
"src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -83,16 +83,29 @@ public: * * @note lhs_info.k0 must be equal to rhs_info.k0 */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -100,14 +113,14 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index b34c17cda8..1b19f1ec5b 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,24 +47,36 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, 
DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); @@ -71,19 +84,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -93,10 +107,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); 
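[Editorial note, not part of the patch] The k0/n0 checks reformatted above use a power-of-two bit trick: (v & (v - 1)) is zero exactly when v is a power of two, and the extra "v != 3" clause whitelists 3, so together with the separate 2..16 range check the accepted values are 2, 3, 4, 8 and 16. The standalone sketch below is illustrative only (the file name and helper name are invented for this note); it enumerates the values that pass the same predicate the macros encode.

// block_size_check_sketch.cpp -- illustrative sketch, not part of this patch.
// Mirrors the predicate used by the k0/n0 validation macros in these kernels:
//   ARM_COMPUTE_RETURN_ERROR_ON_MSG(((v & (v - 1)) && v != 3), "Only 2,3,4,8,16 ...");
// combined with the separate range check 2 <= v <= 16.
#include <iostream>

// Returns true when 'v' would pass the kernel's k0/n0 validation.
static bool is_supported_block_size(unsigned int v)
{
    const bool in_range      = (v >= 2) && (v <= 16);
    const bool pow2_or_three = !(((v & (v - 1)) != 0) && (v != 3)); // (v & (v - 1)) == 0 iff v is a power of two
    return in_range && pow2_or_three;
}

int main()
{
    // Prints: 2 3 4 8 16
    for (unsigned int v = 1; v <= 16; ++v)
    {
        if (is_supported_block_size(v))
        {
            std::cout << v << ' ';
        }
    }
    std::cout << '\n';
    return 0;
}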
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -106,9 +121,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -116,8 +132,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +Window validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -128,14 +150,14 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
// This approach should only be used when the input/dst tensors have pad on the y direction - if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) + if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) { reinterpret_output_as_3d = false; } TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -148,7 +170,8 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -164,14 +187,22 @@ ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKe _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); @@ -182,11 +213,11 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext _export_to_cl_image = rhs_info.export_to_cl_image; _has_pad_y = gemm_info.has_pad_y; - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) + if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -199,8 +230,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ElementsProcessed num_elements_processed{}; // Configure kernel window - Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? 
src2->clone().get() : nullptr, dst->clone().get(), lhs_info, rhs_info, gemm_info, - num_elements_processed); + Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), + (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ICLKernel::configure_internal(win); // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true, @@ -225,7 +257,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -240,17 +273,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - if(_has_pad_y) + if (_has_pad_y) { build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); } - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? 
"t" : "nt"; @@ -294,28 +333,39 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -341,12 +391,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -354,7 +406,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - 
if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -365,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -387,22 +439,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[bia_idx_batch_size])); + _kernel.setArg(idx++, + static_cast(src2->info()->strides_in_bytes()[bia_idx_batch_size])); } // dst stride_z _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[out_idx_batch_size])); // Cross-plan padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d && _has_pad_y) + if (_reinterpret_input_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad_lhs)); } // Cross-plan padding (if reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d && _has_pad_y) + if (_reinterpret_output_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad_out)); } @@ -413,8 +466,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h index 471160c94b..e8fd78d476 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H +#include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -74,32 +74,46 @@ public: * rhs_info.transpose: true,false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo 
&lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + bool _has_pad_y{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 734f8f9b4c..9a2a4890f3 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -56,23 +57,36 @@ constexpr int mmul_m0 = 4; constexpr int mmul_n0 = 4; constexpr int mmul_k0 = 4; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix 
must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && rhs_info.n0 != 8 && rhs_info.n0 != 16, "Only 1,2,3,4,8, and 16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && + rhs_info.n0 != 8 && rhs_info.n0 != 16, + "Only 1,2,3,4,8, and 16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, "Only true is supported for interleave with mmul extension enabled"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, "Only false is supported for transpose with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, + "Only true is supported for interleave with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, + "Only false is supported for transpose with mmul extension enabled"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); @@ -87,7 +101,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); // Validate the reinterpreted-as-3D-case - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -97,9 +111,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } // Validate the gemm-batched case - if(src1->num_dimensions() > 2) + if (src1->num_dimensions() > 2) { - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2)); } @@ -109,15 +123,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -125,18 +140,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo 
tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -144,7 +161,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -152,11 +173,12 @@ std::pair validate_and_configure_window(ITensorInfo *src0, ITens bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -204,19 +226,26 @@ ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmMatrixMultiplyReshapedOnlyR _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, 
src1, src2, dst}); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; @@ -236,7 +265,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -249,7 +279,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); @@ -283,37 +314,44 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info) - .first); + dst->clone().get(), lhs_info, rhs_info, gemm_info) + .first); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -321,12 +359,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } Window slice = window.first_slice_window_3D(); @@ -336,14 +376,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, unsigned int idx = 0; add_3d_tensor_nhw_argument(idx, src0); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(_add_bias) + if (_add_bias) { add_3d_tensor_nhw_argument(idx, src2); } @@ -358,8 +398,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, // LWS_x should be multiple of 16 at least. 
(32, 2) has been chosen to have more work-items on a single core // LWS also enforces the order of execution of the workitems which improves cache utilization enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 59612fcf5d..86d3012f6e 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,13 @@ public: * rhs_info.transpose: false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -69,7 +76,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -77,11 +90,11 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp index bf4b664b6e..eea2a169a3 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,13 +47,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 
0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose); @@ -60,10 +65,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -71,14 +77,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Window +configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) { const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; TensorInfo tmp_info(*src); - if(reinterpret_input_as_3d) + if (reinterpret_input_as_3d) { // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -88,10 +95,12 @@ Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixI } // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d))); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape( + *src, lhs_info, reinterpret_input_as_3d))); // Configure window - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -106,14 +115,18 @@ ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - auto padding_info = 
get_padding_info({ src }); + auto padding_info = get_padding_info({src}); const unsigned int src_w = src->dimension(0); const unsigned int m = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1); @@ -168,7 +181,10 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); return Status{}; @@ -179,8 +195,9 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -192,8 +209,7 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h index db88e0d735..8e84e8ad8e 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h @@ -57,14 +57,21 @@ public: * lhs_info.interleave: true, false * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -72,4 +79,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp index 
b3a03880ed..b9ce3873c7 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,8 +53,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), + "Only 1,2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); @@ -61,15 +64,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); + const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, + src->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -77,23 +82,27 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +std::pair +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) { const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; bool window_changed = false; // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + 
calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); window_changed = update_window_and_padding(win, src_access); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { gemm::update_padding_for_cl_image(dst); } @@ -102,7 +111,8 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -112,7 +122,10 @@ ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -143,7 +156,9 @@ void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_con _kernel.setArg(idx++, rhs_info.h0); } -Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); @@ -156,8 +171,9 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -169,9 +185,8 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h index 31eaa46e02..7203d574fb 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h @@ -66,7 +66,10 @@ public: * rhs_info.transpose: true, false * rhs_info.interleave: true, 
false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() @@ -81,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp index 719201d1fe..2e1cefc6e7 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -62,8 +62,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co } } // namespace -ClHeightConcatenateKernel::ClHeightConcatenateKernel() - : _height_offset(0) +ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0) { _type = CLKernelType::ELEMENTWISE; } @@ -74,12 +73,15 @@ Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int return Status{}; } -void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int height_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _height_offset = height_offset; @@ -90,9 +92,10 @@ void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if 
(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -125,8 +128,9 @@ void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h index d3c077fc22..5a391a1212 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClHeightConcatenateKernel::configure() @@ -64,7 +65,7 @@ public: private: unsigned int _height_offset; - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp index e890847199..ef7a52828f 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp +++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -60,13 +61,19 @@ struct Im2ColConfiguration bool is_padding_required_nchw{}; }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias); 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); @@ -82,9 +89,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { - const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape( + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -93,13 +101,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) +std::pair validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_elems_processed_per_iteration, + bool is_padding_required_nchw, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape)); @@ -113,22 +129,22 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso bool window_changed = false; Window win; - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); } else { - if(is_padding_required_nchw) + if (is_padding_required_nchw) { - const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - win = calculate_max_window(*src, - Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); - AccessWindowStatic input_access(src, - -border.left, - -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), - input_height + border.bottom); + const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), + conv_info.pad_left()); + win = calculate_max_window( + *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); + AccessWindowStatic input_access( + src, -border.left, -border.top, + ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), + input_height + border.bottom); window_changed = window_changed || update_window_and_padding(win, 
input_access); } else @@ -142,11 +158,17 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) +Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -157,7 +179,8 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D const unsigned int input_height = src->dimension(height_idx); const unsigned int input_channel = src->dimension(channel_idx); - const std::pair convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + const std::pair convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); // Im2Col configuration std::string kernel_name = "im2col_generic_"; @@ -184,21 +207,22 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); + build_opts.add_option_if_else(is_data_type_quantized(data_type), + "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); build_opts.add_option_if(has_bias, "-DHAS_BIAS"); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { num_elems_processed_per_iteration = std::min(2U, input_channel); is_padding_required_nchw = false; // Only the 3x3 and 9x9 cases are optimized for NHWC - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { kernel_name = "im2col3x3_"; build_opts.add_option("-DIM2COL_3X3"); } - else if(kernel_dims == Size2D(9U, 9U)) + else if (kernel_dims == Size2D(9U, 9U)) { kernel_name = "im2col9x9_"; build_opts.add_option("-DIM2COL_9X9"); @@ -219,17 +243,17 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } else { - if(dilation == Size2D(1U, 1U)) + if (dilation == Size2D(1U, 1U)) { const bool squared_im2col = kernel_dims.width == kernel_dims.height; - if(squared_im2col) + if (squared_im2col) { // Check if we can run an optimized im2col for NCHW - switch(kernel_dims.width) + switch (kernel_dims.width) { case 1: // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false - if(conv_info.stride().first == 1 && !conv_info.has_padding()) + if (conv_info.stride().first == 1 && !conv_info.has_padding()) { kernel_name = "im2col1x1_stridex1_"; 
num_elems_processed_per_iteration = 4; @@ -248,7 +272,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D break; case 11: // Optimized im2col11x11 if pad_x = pad_y = 0 - if(!conv_info.has_padding()) + if (!conv_info.has_padding()) { kernel_name = "im2col11x11_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -262,7 +286,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D break; } } - else if(kernel_dims.width > 1 && !conv_info.has_padding()) + else if (kernel_dims.width > 1 && !conv_info.has_padding()) { kernel_name = "im2col_generic_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -297,19 +321,29 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } // namespace ClIm2ColKernel::ClIm2ColKernel() - : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() + : _data_layout(DataLayout::UNKNOWN), + _convolved_dims(), + _num_elems_processed_per_iteration(1), + _kernel_dims(), + _conv_info(), + _num_groups() { _type = CLKernelType::ELEMENTWISE; } -void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation, - unsigned int num_groups) +void ClIm2ColKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _data_layout = src->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); @@ -320,19 +354,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI // Select and configure the optimal OpenCL kernel to run. 
// This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration // and the padding requirement flag - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); // Create kernel _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options); - _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + _convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; _kernel_dims = kernel_dims; // Only needed by the Tuner _conv_info = conv_info; // Only needed by the Tuner _num_groups = num_groups; // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -353,14 +390,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); } -Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status ClIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, + conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups) - .first); + .first); return Status{}; } @@ -388,7 +433,7 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm Window slice_in = first_slice_3d; Window slice_out = window_output.first_slice_window_2D(); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); const int num_batches = tmp_win[3].end(); @@ -398,7 +443,10 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm } else { - slice.set(0, Window::Dimension(0, 
static_cast(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); + slice.set(0, + Window::Dimension( + 0, static_cast(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), + _num_elems_processed_per_iteration)); slice.set(1, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); // Note: In case of NCHW the 3rd dimension is already set collapsing the input window } @@ -414,14 +462,16 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + unsigned int idx = num_arguments_per_3D_tensor() + + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); _kernel.setArg(idx++, static_cast(src->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); + _kernel.setArg(idx++, + static_cast(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); do { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice_in); - if(_num_groups == 1) + if (_num_groups == 1) { add_2D_tensor_argument(idx, dst, slice_out); } @@ -430,8 +480,8 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, dst, slice_out); } enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); + } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && + window_collapsed.slide_window_slice_3D(slice_in)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h index a637ad215d..c8cd5b328d 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.h +++ b/src/gpu/cl/kernels/ClIm2ColKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -77,28 +78,38 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIm2ColKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - DataLayout _data_layout; + DataLayout _data_layout; std::pair _convolved_dims; - unsigned int _num_elems_processed_per_iteration; - Size2D _kernel_dims; - PadStrideInfo _conv_info; - unsigned int _num_groups; + unsigned int _num_elems_processed_per_iteration; + Size2D _kernel_dims; + PadStrideInfo _conv_info; + unsigned int _num_groups; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp index d291fad76c..8c493d08c6 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,26 +44,29 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -75,8 +79,12 @@ ClIndirectConv2dAddressPrecalculationKernel::ClIndirectConv2dAddressPrecalculati _type = CLKernelType::ELEMENTWISE; } -void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc)); @@ -85,11 +93,8 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte constexpr unsigned int height_idx = 2; // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); @@ -136,14 +141,19 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte // Since this kernel should be called only once, we do not need to set the config_id for tuning } -Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc)); return Status{}; } -void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h index ff7f4be147..b565609c6a 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h +++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h 
@@ -60,16 +60,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dAddressPreCalculationKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp index a337eb50fd..3510b6970c 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp @@ -23,13 +23,14 @@ */ #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +47,14 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -55,37 +62,38 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(indirect_buffer->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + indirect_buffer->tensor_shape(), + 
misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); constexpr int channel_idx = 0; constexpr int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, - "K0 can only be: 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights), "Export to CLImage is not supported for this weight configuration"); } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -95,15 +103,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -116,13 +123,21 @@ ClIndirectConv2dKernel::ClIndirectConv2dKernel() _type = CLKernelType::DIRECT; } -void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *indirect_buffer, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo 
&desc) +void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *indirect_buffer, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst); // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); constexpr unsigned int channel_idx = 0; constexpr unsigned int width_idx = 1; @@ -137,10 +152,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; @@ -164,7 +176,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _export_to_cl_image = desc.export_weights_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_to_cl_image) + if (_export_to_cl_image) { gemm::update_padding_for_cl_image(weights); } @@ -173,11 +185,12 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // When M0 is 5, 6, and 7, we use vload8 to fetch the data from the buffer const unsigned int load_indirect_buf_size = m0 > 4 ? 
8 : m0; const unsigned int indirect_buf_width = indirect_buffer->tensor_shape()[0]; - const unsigned int round_up_width = ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; - const unsigned int padding = round_up_width - indirect_buf_width; + const unsigned int round_up_width = + ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; + const unsigned int padding = round_up_width - indirect_buf_width; indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0)); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -186,9 +199,10 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324 const auto act_function = act_info.activation(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (data_type == DataType::F32 || data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (data_type == DataType::F32 || data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -224,7 +238,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // A macro guard to compile ONLY the kernel of interest build_options.add_option("-D" + upper_string(kernel_name.str())); - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -253,10 +267,17 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _config_id += support::cpp11::to_string(dst->dimension(channel_idx)); } -Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); return Status{}; } @@ -268,35 +289,42 @@ void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto indirect_buffer = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_3)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto indirect_buffer = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_3)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); cl::Image2D weights_cl_image; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); + const size_t image_w = weights->info()->dimension(0) / 4; + const size_t image_h = + weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, + weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, indirect_buffer); add_4d_tensor_nhwc_argument(idx, dst); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h index b6c7b35fa4..04166d417e 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -60,22 +61,35 @@ public: * @param[in] act_info Contains activaton information described in @ref ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC indirect convolution kernel. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *off, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *off, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *off, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *off, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - bool _export_to_cl_image{ false }; + bool _export_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp index 66331bc818..0bb6b0c083 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp @@ -29,17 +29,16 @@ #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -62,51 +61,62 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & 
(k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; } -} +} // namespace ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && act_info.activation() != ActivationFunction::RELU - && act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && act_info.activation() != ActivationFunction::BOUNDED_RELU), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && + act_info.activation() != ActivationFunction::RELU && + act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && + act_info.activation() != ActivationFunction::BOUNDED_RELU), "Activation Function specified is unsupported."); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -115,7 +125,12 @@ Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorI return Status{}; } -void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -123,7 +138,8 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet 
initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -217,10 +233,13 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -229,7 +248,7 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h index 64415f42f7..ffdb720855 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -54,7 +55,12 @@ public: * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Class containing information about fused activation function. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -62,7 +68,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp index 464212d7db..94e3c4e47b 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp @@ -28,10 +28,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -64,13 +64,15 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 - ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for k0"); @@ -84,7 +86,11 @@ ClMatMulLowpNativeMMULKernel::ClMatMulLowpNativeMMULKernel() _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); @@ -102,16 +108,17 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY), "Activation Function 
specified is unsupported."); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -121,15 +128,21 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen return Status{}; } -void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, - const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) +void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info); ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info); CLBuildOptions build_opts; @@ -147,7 +160,8 @@ void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_con const unsigned int n0_leftover = n % n0; // Configure kernel window - const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); + const auto win_config = + validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -215,10 +229,13 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto *bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - auto *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto *lhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto *rhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto *bias = 
utils::cast::polymorphic_downcast( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + auto *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -227,7 +244,7 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h index d2aa40b2e2..6c56f15d74 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -49,7 +50,12 @@ public: * * @return a status */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -57,7 +63,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp index 41ba5d5e25..a1fa9fa9ab 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/CL/CLUtils.h" @@ -38,7 +38,6 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -61,20 +60,23 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } 
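Note on the recurring guard above and in the N0/K0 checks that follow: the condition ((v & (v - 1)) && (v != 3)) || (v > 16) rejects any M0/N0/K0 block size outside {1, 2, 3, 4, 8, 16}, because v & (v - 1) is zero exactly when v is a power of two. The short sketch below restates that check on its own; it is illustrative only, the helper name is invented and it is not part of this patch or of the library.

#include <cstdio>

// Illustrative restatement of the block-size guard used by the
// validate_matmul_kernel_info() helpers: accept only 1, 2, 3, 4, 8 or 16.
static bool is_supported_block_size(int v)
{
    const bool power_of_two = (v >= 1) && ((v & (v - 1)) == 0); // 1, 2, 4, 8, 16, ...
    return (power_of_two || v == 3) && (v <= 16);
}

int main()
{
    for (int v = 1; v <= 20; ++v)
    {
        std::printf("%2d -> %s\n", v, is_supported_block_size(v) ? "supported" : "rejected");
    }
    return 0; // prints "supported" only for 1, 2, 3, 4, 8 and 16
}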
// Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; @@ -83,30 +85,37 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings()); - if(matmul_kernel_info.export_rhs_to_cl_image) + if (matmul_kernel_info.export_rhs_to_cl_image) { - if(matmul_kernel_info.adj_rhs) + if (matmul_kernel_info.adj_rhs) { const int k0 = matmul_kernel_info.k0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, + "K0 can only be: 4, 8, and 16 for Rhs transposed"); } else { const int n0 = matmul_kernel_info.n0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, + "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), + "Export to CLImage is not supported for this device/configuration"); } return Status{}; } -} +} // namespace ClMatMulNativeKernel::ClMatMulNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -114,28 +123,36 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info)); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), 
rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs); ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], + "First dimension of bias and output tensors must match."); } return Status{}; } -void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -143,7 +160,8 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -187,7 +205,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { gemm::update_padding_for_cl_image(rhs); } @@ -222,10 +240,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = utils::cast::polymorphic_downcast( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, 
bias, dst); @@ -235,7 +256,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3d_tensor_nhw_argument(idx, lhs); cl::Image2D rhs_cl_image; - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { const size_t image_w = rhs->info()->dimension(0) / 4; const size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0); @@ -243,12 +264,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, + rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); _kernel.setArg(idx++, rhs_cl_image); } add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h index fe2b787c12..2cb150bc8f 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,12 @@ public: * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Specifies activation function to use after Matrix multiplication. Default is Identity function. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -60,14 +66,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _export_rhs_to_cl_image{ false }; + bool _export_rhs_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp index 432270e8bf..76bf846e74 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp @@ -28,14 +28,13 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -62,31 +61,38 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for k0"); return Status{}; } -} +} // namespace ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo 
*bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); @@ -96,32 +102,40 @@ Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorI const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], + "First dimension of bias and output tensors must match."); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias); } return Status{}; } -void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info); ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -135,7 +149,8 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context const int n0 = adjust_vec_size(matmul_kernel_info.n0, n); // Configure kernel window - const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); + const auto win_config = + 
validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -186,17 +201,20 @@ void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = utils::cast::polymorphic_downcast( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); unsigned int idx = 0; add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h index 80448974c4..1aeb896325 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h @@ -72,22 +72,31 @@ public: * @param[out] dst Output tensor info. 
* @param[in] matmul_info Attributes for Batch MatMul Kernel */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulNativeMMULKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - int _m{ 1 }; - int _n{ 1 }; - int _k{ 1 }; + int _m{1}; + int _n{1}; + int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp index 5ca0639852..3b59c2a7fc 100644 --- a/src/gpu/cl/kernels/ClMulKernel.cpp +++ b/src/gpu/cl/kernels/ClMulKernel.cpp @@ -23,15 +23,16 @@ */ #include "src/gpu/cl/kernels/ClMulKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,24 +47,25 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); 
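Context for the scale handling in ClMulKernel (the non-negative check just below, and the integer-shift selection in configure() further down this hunk): when scale equals 1/2^n for 0 <= n <= 15, std::frexp() returns a mantissa of 0.5 and an exponent of 1 - n, so the kernel can be given the shift n as an integer instead of a float scale. The sketch below is a standalone illustration of that classification under those assumptions; the helper name and the -1 "use the float path" convention are invented and are not part of this patch.

#include <cmath>
#include <cstdio>
#include <cstdlib>

// Returns n when scale == 1/2^n for 0 <= n <= 15, otherwise -1 (float path).
// Illustrative helper; not taken from the library.
static int scale_as_right_shift(float scale)
{
    int         exponent = 0;
    const float mantissa = std::frexp(scale, &exponent);
    // For scale = 1/2^n, frexp() yields mantissa 0.5 and exponent 1 - n,
    // so n in [0, 15] corresponds to exponent in [-14, 1].
    if (mantissa == 0.5f && exponent >= -14 && exponent <= 1)
    {
        return std::abs(exponent - 1); // recover n
    }
    return -1;
}

int main()
{
    const float scales[] = {1.0f, 0.5f, 0.25f, 1.0f / 32768.0f, 0.3f};
    for (float s : scales)
    {
        std::printf("scale=%g -> shift=%d\n", static_cast<double>(s), scale_as_right_shift(s));
    }
    return 0;
}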
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); @@ -76,27 +78,35 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && + (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), "Dst can only be U8 if both src are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), - "Dst can only be QASYMM8 if both src are QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), - "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), - "Dst can only be QSYMM16 if both src are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QASYMM8 && + (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), + "Dst can only be QASYMM8 if both src are QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QASYMM8_SIGNED && + (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), + "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QSYMM16 && + (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), + "Dst can only be QSYMM16 if both src are QSYMM16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && + (dst->data_type() != DataType::S32), "Dst must be S32 if source tensors are S32"); - if(in_place) + if (in_place) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, + src1_in_place ? 
src1->tensor_shape() : src2->tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); } else { @@ -114,14 +124,19 @@ ClMulKernel::ClMulKernel() _type = CLKernelType::ELEMENTWISE; } -void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMulKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, - scale, overflow_policy, rounding_policy, act_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); @@ -133,7 +148,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 // Moreover, it will be negative as we deal with 1/2^n - if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) + if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) { // Store the positive exponent. We know that we compute 1/2^n // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 @@ -142,19 +157,19 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo std::string acc_type; // Check if it has float src and dst - if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) + if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) { scale_int = -1; acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half"; } else { - if(src1->element_size() == 4 || src2->element_size() == 4) + if (src1->element_size() == 4 || src2->element_size() == 4) { // use 64 bit accumulator for 32-bit input acc_type = "long"; } - else if(src1->element_size() == 2 || src2->element_size() == 2) + else if (src1->element_size() == 2 || src2->element_size() == 2) { // Use 32-bit accumulator for 16-bit input acc_type = "int"; @@ -176,11 +191,15 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) + ? 
"1" + : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) + ? "1" + : support::cpp11::to_string(vec_size))); build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - if(is_quantized && (dst->data_type() != DataType::S32)) + if (is_quantized && (dst->data_type() != DataType::S32)) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -200,12 +219,14 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo else { kernel_name += (scale_int >= 0) ? "_int" : "_float"; - build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); + build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), + "-DWRAP", "-DSATURATE"); build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); - if(act_info.enabled()) + if (act_info.enabled()) { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); } @@ -223,7 +244,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo // Set scale argument unsigned int idx = (in_place ? 
2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - if(scale_int >= 0 && !is_quantized) + if (scale_int >= 0 && !is_quantized) { _kernel.setArg(idx++, scale_int); } @@ -261,8 +282,13 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo _config_id += support::cpp11::to_string(dst->dimension(2)); } -Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMulKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); @@ -275,9 +301,11 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); @@ -286,17 +314,18 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command const TensorShape &out_shape = dst->info()->tensor_shape(); bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -312,7 +341,7 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_input1); add_3D_tensor_argument(idx, src_1, slice_input2); - if(!in_place) + if (!in_place) { add_3D_tensor_argument(idx, dst, slice); } @@ -320,15 +349,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } namespace { constexpr unsigned int vec_size_complex = 1; -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status validate_arguments_complex(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); @@ -340,11 +371,12 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); } return Status{}; @@ -356,19 +388,23 @@ ClComplexMulKernel::ClComplexMulKernel() _type = CLKernelType::ELEMENTWISE; } -void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMulKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - if(act_info.enabled()) + if (act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); @@ -384,7 +420,10 @@ void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITen ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMulKernel::validate(const 
ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); @@ -397,26 +436,29 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl:: ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); const TensorShape &in_shape1 = src_0->info()->tensor_shape(); const TensorShape &in_shape2 = src_1->info()->tensor_shape(); const TensorShape &out_shape = dst->info()->tensor_shape(); bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -435,8 +477,7 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl:: ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h index 4e62a6d67a..76a3ce02c1 100644 --- a/src/gpu/cl/kernels/ClMulKernel.h +++ b/src/gpu/cl/kernels/ClMulKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_MUL_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -72,16 +73,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -101,14 +113,21 @@ public: * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp index 8d4655114b..a4755782ed 100644 --- a/src/gpu/cl/kernels/ClPermuteKernel.cpp +++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp @@ -29,8 +29,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,13 +61,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const "Permutation up to 4-D src tensor is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4, "Permutation vector size should be less than or equal to 4"); - for(const auto &p : perm) + for (const auto &p : perm) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation 
vector has invalid values"); } // Validate configured dst - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -82,10 +83,13 @@ ClPermuteKernel::ClPermuteKernel() _type = CLKernelType::ELEMENTWISE; } -void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermuteKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const TensorShape dst_shape = get_dst_shape(src, perm); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); @@ -96,7 +100,8 @@ void ClPermuteKernel::configure(const CLCompileContext &compile_context, const I // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2))); // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0)); @@ -126,8 +131,9 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); @@ -144,9 +150,8 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com add_4D_tensor_argument(idx, src, slice_in); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_in, lws_hint()); - } - while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h index 0d349e739b..2413b10284 100644 --- a/src/gpu/cl/kernels/ClPermuteKernel.h +++ b/src/gpu/cl/kernels/ClPermuteKernel.h @@ -52,7 +52,10 @@ public: * @param[in] dst The dst tensor info. 
Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClPermuteKernel::configure() diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp index a1afc585e0..41ab4d6922 100644 --- a/src/gpu/cl/kernels/ClPool2dKernel.cpp +++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,37 +44,47 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), - "Unsupported combination of parameters!"); - - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_global_pooling = pool_info.is_global_pooling; - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - int output_width = 0; - int output_height = 0; - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); - - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size_x, pool_size_y, pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), + "Unsupported combination of parameters!"); + + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const bool is_global_pooling = pool_info.is_global_pooling; + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + int output_width = 0; + int output_height = 0; + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); + + std::tie(output_width, output_height) = + scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x, + pool_size_y, pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), + "Calculated output dimension size is invalid"); // Check indices - if(indices) + if (indices) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), + "Pooling indices only supported for pool size 2x2"); - if(indices->total_size() != 0) + if (indices->total_size() != 0) { TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); @@ -81,7 +92,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -98,42 +109,47 @@ ClPool2dKernel::ClPool2dKernel() _type = CLKernelType::POOL; } -void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +void ClPool2dKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); - auto padding_info = get_padding_info({ src, dst, indices }); + auto padding_info = get_padding_info({src, dst, indices}); // Auto init if empty TensorShape out_shape = compute_pool_shape(*src, pool_info); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); - if(indices) + if (indices) { auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); } // Set instance variables - _pool_info = pool_info; - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - _num_elems_processed_per_iteration = (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 
2 : 4); + _pool_info = pool_info; + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + _num_elems_processed_per_iteration = + (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4); _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0)); - int pool_stride_x = 0; - int pool_stride_y = 0; - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool exclude_padding = pool_info.exclude_padding; + int pool_stride_x = 0; + int pool_stride_y = 0; + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool exclude_padding = pool_info.exclude_padding; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const DataType data_type = src->data_type(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const DataType data_type = src->data_type(); // Set build options CLBuildOptions build_opts; @@ -148,20 +164,23 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); + build_opts.add_option("-DMAX_WIDTH=" + + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); + build_opts.add_option("-DMAX_HEIGHT=" + + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 
0 : pool_pad_top))); // Tensor paddings are used to calculate the indicies for MAX pooling - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && + is_data_type_float(data_type)) { build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3))); } - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { build_opts.add_option("-DQUANTIZED"); - if(src->quantization_info() != dst->quantization_info()) + if (src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -174,9 +193,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } // Set the initial value for the pooling operation accordingly with the data type - if(pool_type == PoolingType::MAX) + if (pool_type == PoolingType::MAX) { - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { PixelValue type_min{}; std::tie(type_min, std::ignore) = get_min_max(data_type); @@ -184,7 +203,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } else { - std::string initial_value = pool_info.use_inf_as_limit ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits::lowest()); + std::string initial_value = pool_info.use_inf_as_limit + ? "(-INFINITY)" + : float_to_string_with_full_precision(std::numeric_limits::lowest()); build_opts.add_option("-DINITIAL_VALUE=" + initial_value); } } @@ -195,22 +216,25 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } // Create kernel - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : (is_data_type_quantized(data_type) ? DataType::S32 : data_type)); + const auto acc_data_type = get_cl_type_from_data_type( + use_wider_accumulator ? DataType::F32 + : (is_data_type_quantized(data_type) ? 
DataType::S32 : data_type)); build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); - if(pool_type != PoolingType::MAX) + if (pool_type != PoolingType::MAX) { build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); } - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && + is_data_type_float(data_type)) { // For max pooling with pool2x2, store indicies which will be used in max unpooling std::string kernel_name = "pooling_layer_2_nchw_indices"; @@ -226,18 +250,19 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI case DataLayout::NHWC: { // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + const auto use_fp_mixed_precision = + (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; // Wider accumulation is required to avoid accuracy loss // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) DataType acc_data_type = data_type; - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { acc_data_type = DataType::F32; } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) + else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) { acc_data_type = DataType::S32; } @@ -250,8 +275,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) { build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); @@ -260,7 +286,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } else { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) + ? 
"pooling_layer_MxN_quantized_nhwc" + : "pooling_layer_MxN_nhwc"; _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); } break; @@ -290,7 +318,10 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status ClPool2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); return Status{}; @@ -301,18 +332,19 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - unsigned int pool_stride_x = 0; - unsigned int pool_stride_y = 0; + unsigned int pool_stride_x = 0; + unsigned int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); - auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); + auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); // Collapse window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -323,13 +355,12 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) + if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) { add_3D_tensor_argument(idx, indices, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); break; } case DataLayout::NHWC: @@ -338,7 +369,8 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm Window slice = window_collapsed.first_slice_window_4D(); Window in_slice = window_collapsed.first_slice_window_4D(); - in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); + in_slice.set(Window::DimX, + Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); in_slice.set(3, Window::Dimension(0, batch_size, 1)); @@ -348,13 +380,13 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm unsigned int idx = 0; add_4D_tensor_argument(idx, src, in_slice); add_4D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == 
PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + if (indices && is_data_type_float(src->info()->data_type()) && + (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) { add_4D_tensor_argument(idx, indices, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); + } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); break; } default: diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h index f5bb0687e8..56b95a37d5 100644 --- a/src/gpu/cl/kernels/ClPool2dKernel.h +++ b/src/gpu/cl/kernels/ClPool2dKernel.h @@ -50,22 +50,29 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 1 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp index d068832fed..a08c5d4be7 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.cpp +++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,10 +51,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding - && (pool_info.pool_type == PoolingType::AVG)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), + "Strides cannot be zero."); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && + (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)), "Exclude padding is unsupported for non-float types for Avg op"); const auto data_layout = src->data_layout(); @@ -68,17 +72,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; int output_depth = 0; - bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric."); + bool round_type_ceil_with_asymm_padding = + (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, + "Cannot use dimension round type CEIL when padding is asymmetric."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); - std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, - pool_size_z, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -95,11 +103,14 @@ ClPool3dKernel::ClPool3dKernel() _type = CLKernelType::POOL; } -void ClPool3dKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +void ClPool3dKernel::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Auto init if empty TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info); @@ -112,23 +123,23 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT _num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 
2 : 4; _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0)); - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const int pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth; - const bool exclude_padding = pool_info.exclude_padding; - const int pool_stride_x = pool_info.stride.x(); - const int pool_stride_y = pool_info.stride.y(); - const int pool_stride_z = pool_info.stride.z(); - const int pool_pad_top = pool_info.padding.top; - const int pool_pad_left = pool_info.padding.left; - const int pool_pad_front = pool_info.padding.front; - const DataType data_type = src->data_type(); + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? 
src->dimension(idx_depth) : pool_info.pool_size.depth; + const bool exclude_padding = pool_info.exclude_padding; + const int pool_stride_x = pool_info.stride.x(); + const int pool_stride_y = pool_info.stride.y(); + const int pool_stride_z = pool_info.stride.z(); + const int pool_pad_top = pool_info.padding.top; + const int pool_pad_left = pool_info.padding.left; + const int pool_pad_front = pool_info.padding.front; + const DataType data_type = src->data_type(); // Set build options CLBuildOptions build_opts; @@ -149,7 +160,7 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth))); // If datatype is quantized add relevant parameters - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -161,9 +172,9 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Set the initial value for the pooling operation accordingly with the data type - if(pool_type == PoolingType::MAX) + if (pool_type == PoolingType::MAX) { - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { PixelValue type_min{}; std::tie(type_min, std::ignore) = get_min_max(data_type); @@ -171,7 +182,8 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } else { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); + build_opts.add_option("-DINITIAL_VALUE=" + + float_to_string_with_full_precision(std::numeric_limits::lowest())); } } else @@ -181,16 +193,18 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Create kernel // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + const auto use_fp_mixed_precision = + (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; // Wider accumulation is required to avoid accuracy loss // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) DataType acc_data_type = data_type; - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { acc_data_type = DataType::F32; } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division + else if (is_data_type_quantized(data_type) && + pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division { acc_data_type = DataType::S32; } @@ -202,11 +216,13 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth))); build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % 
_num_elems_processed_per_iteration)); // if datatype is quantized use quantized kernel function - std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc"); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" + : "pooling_3d_layer_MxN_ndhwc"); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -240,8 +256,9 @@ void ClPool3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); // Collapse 3D window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h index 00852349e6..6cd229c427 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.h +++ b/src/gpu/cl/kernels/ClPool3dKernel.h @@ -50,7 +50,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3dKernel::configure() @@ -64,8 +67,8 @@ public: private: Pooling3dLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 1 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp index 5c8bf97f0f..e8df420f67 100644 --- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,12 +49,14 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); // Output must always be initialized ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); return Status{}; @@ -71,7 +72,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); @@ -84,7 +85,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const float scale_to_apply = qinfo.scale; int32_t offset_to_apply = qinfo.offset; - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { /* * In case of requantization of a quantized input tensor to an output tensor with another quantization @@ -132,8 +133,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(input_width_x - vec_size_x, 
0))); - std::pair min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type); + build_opts.add_option_if( + multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(input_width_x - vec_size_x, 0))); + std::pair min_max_quant_values = + quantization::get_min_max_values_from_quantized_data_type(output_data_type); build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first)); build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); @@ -141,9 +144,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps()); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -173,8 +177,7 @@ void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Co add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp index 121bb33edf..53889f3a6b 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->tensor_shape().total_size() != 0) + if (dst->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -72,27 +73,17 @@ void ClReshapeKernel::configure(const CLCompileContext &compile_context, const I ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel - std::set build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) }; + std::set build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())}; _kernel = create_kernel(compile_context, "reshape_layer", build_opts); // Add static arguments - const cl_int2 src_shape = - { - { - static_cast(src->tensor_shape()[0]), - static_cast(src->tensor_shape()[1]) - } - }; - const cl_int2 dst_shape = - { - { - static_cast(dst->tensor_shape()[0]), - static_cast(dst->tensor_shape()[1]) - } - }; + const cl_int2 src_shape = { + {static_cast(src->tensor_shape()[0]), static_cast(src->tensor_shape()[1])}}; + const cl_int2 dst_shape = { + {static_cast(dst->tensor_shape()[0]), static_cast(dst->tensor_shape()[1])}}; unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters _kernel.setArg(idx++, 
src_shape); _kernel.setArg(idx++, dst_shape); @@ -119,8 +110,9 @@ void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); // Set srcs unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h index db6ab5da58..95eae82086 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.h +++ b/src/gpu/cl/kernels/ClReshapeKernel.h @@ -58,7 +58,7 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; }; -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute #endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp index 4c4373a215..4305acad26 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClScaleKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/InterpolationPolicyUtils.h" +#include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,8 @@ namespace kernels { namespace { -inline std::tuple calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) +inline std::tuple +calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -64,20 +66,25 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1); - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON( + info.align_corners && + 
!arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && + !is_data_type_quantized_asymmetric(src->data_type())); float scale_x = 0.f; float scale_y = 0.f; const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); - ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); + ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && + (scale_x > 1.f || scale_y > 1.f)); return Status{}; } @@ -94,23 +101,26 @@ ClScaleKernel::ClScaleKernel() _type = CLKernelType::ELEMENTWISE; } -void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScaleKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Info required for the static tuning _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; const bool is_nhwc = _data_layout == DataLayout::NHWC; - float scale_x = 0.f; - float scale_y = 0.f; + float scale_x = 0.f; + float scale_y = 0.f; std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); // Area interpolation behaves as Nearest Neighbour in case of up-sampling auto interpolation_policy_to_use = info.interpolation_policy; - if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) + if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) { interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; } @@ -127,7 +137,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn unsigned int vec_size_leftover = 0; CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 
4 : 8, dst_channels); vec_size_leftover = dst_channels % vec_size; @@ -135,7 +145,8 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER"); build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use)); @@ -144,27 +155,33 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { vec_size = adjust_vec_size(4, dst_width); vec_size_leftover = dst_width % vec_size; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) + ? 
support::cpp11::to_string(vec_size) + : support::cpp11::to_string(vec_size_leftover))); build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); - const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; - if(is_qasymm_bilinear) + const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && + info.interpolation_policy == InterpolationPolicy::BILINEAR; + if (is_qasymm_bilinear) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); @@ -190,7 +207,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // Pass scale kernel arguments - if(is_nhwc) + if (is_nhwc) { unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc(); _kernel.setArg(idx++, scale_x); @@ -219,7 +236,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -231,8 +248,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_2D_tensor_argument(idx, src, slice); add_2D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); break; } case DataLayout::NHWC: diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h index dd09e92ee2..c09659017d 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.h +++ b/src/gpu/cl/kernels/ClScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClScaleKernel::configure() @@ -62,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - DataLayout _data_layout{ DataLayout::UNKNOWN }; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp index 59299fa441..1b5a2666bc 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,15 +62,16 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float // Number of integer bits used in temporary fixed-point representation of exponent accumulator static const int exp_accumulation_in_bits = 12; - const double beta_multiplier = std::min( - 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), - (1LL << 31) - 1.0); + const double beta_multiplier = + std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0); int input_beta_multiplier; int input_beta_left_shift; - quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift); + quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, + &input_beta_left_shift); - const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); - const int diff_min = -1.f * std::floor(max_input_rescaled); + const double max_input_rescaled = + 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); + const int diff_min = -1.f * std::floor(max_input_rescaled); CLBuildOptions build_opts; build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits)); @@ -80,18 +83,22 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float return build_opts; } -Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const ITensorInfo &sum) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); 
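Note: the fixed-point parameters assembled above for the quantized softmax kernel (the clamped beta_multiplier, the derived multiplier/shift pair and DIFF_MIN) can be reproduced with the short standalone sketch below. The quantize_multiplier_gt_one() helper here is a plain gemmlowp-style re-implementation standing in for quantization::calculate_quantized_multiplier_greater_than_one(), and the input scale, beta and scaled_diff_int_bits = 5 are illustrative assumptions, since their definitions sit outside the hunk shown here.

// Illustrative sketch only: reproduces the arithmetic used to derive the quantized-softmax
// kernel defines (e.g. -DSCALED_DIFF_INT_BITS seen above). Not part of the library.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Gemmlowp-style rounding of a multiplier >= 1 into a Q31 mantissa and a left shift.
// This approximates, but is not, quantization::calculate_quantized_multiplier_greater_than_one().
static void quantize_multiplier_gt_one(double multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
{
    int     shift   = 0;
    double  q       = std::frexp(multiplier, &shift); // multiplier = q * 2^shift, q in [0.5, 1)
    int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
    if (q_fixed == (1LL << 31))
    {
        q_fixed /= 2;
        ++shift;
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
    *left_shift           = shift;
}

int main()
{
    const float input_scale          = 0.1f; // example values, not taken from the patch
    const float beta                 = 1.0f;
    const int   scaled_diff_int_bits = 5;    // assumed; defined outside the hunk shown

    // Clamp to INT32_MAX so the later 32-bit fixed-point multiply cannot overflow.
    const double beta_multiplier =
        std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0);

    int32_t input_beta_multiplier = 0;
    int32_t input_beta_left_shift = 0;
    quantize_multiplier_gt_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);

    // Largest representable rescaled input; its negation becomes the DIFF_MIN cut-off.
    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) *
                                      (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
    const int diff_min = -1 * static_cast<int>(std::floor(max_input_rescaled));

    std::printf("scaled_diff_int_bits=%d multiplier=%d left_shift=%d diff_min=%d\n",
                scaled_diff_int_bits, input_beta_multiplier, input_beta_left_shift, diff_min);
    return 0;
}

The clamp on beta_multiplier is what keeps the subsequent Q31 multiply inside the kernel within int32 range; DIFF_MIN then marks inputs whose rescaled difference from the row maximum is too negative to contribute to the exponent accumulator.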
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); // Checks performed when output is configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { - if(is_quantized_asymmetric) + if (is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32); } @@ -103,9 +110,9 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor } // Checks performed when sum is configured - if(sum.total_size() != 0) + if (sum.total_size() != 0) { - if(is_quantized_asymmetric) + if (is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32); } @@ -119,7 +126,10 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor return Status{}; } -Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +Status validate_arguments_1DNorm(const ITensorInfo &src, + const ITensorInfo &sum, + const ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32); @@ -127,14 +137,15 @@ Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type)); // Note: output should always have a scale of 1/256 and offset 0 - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + const QuantizationInfo allowed_quantization_info = + get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); // Checks performed when output is configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - if(!is_quantized_asymmetric) + if (!is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); } @@ -161,9 +172,14 @@ ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel() _type = CLKernelType::ELEMENTWISE; } -void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info) +void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &max, + ITensorInfo &dst, + ITensorInfo &sum, + const SoftmaxKernelInfo &info) { - auto padding_info = get_padding_info({ &src, &max, &dst, &sum }); + auto padding_info = get_padding_info({&src, &max, &dst, &sum}); // Output auto initialization if not yet initialized auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape())); @@ -191,15 +207,21 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED"); - build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + 
float_to_string_with_full_precision(beta)); + build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), + "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX"); - build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX"))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); + build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") + : std::string("-FLT_MAX"))); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), + prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); cl::NDRange lws_hint(cl::NullRange); - std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial"; + std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial"; // Create kernel. _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -211,7 +233,10 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const ITensorInfo &sum) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); return Status{}; @@ -241,7 +266,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window & // Reconfigure window in case of parallel reduction ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0)); - if(std::get<0>(parallel_reduction_info)) + if (std::get<0>(parallel_reduction_info)) { // Launch grid_size parallel work items window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); @@ -258,8 +283,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); add_3D_tensor_argument(idx, sum, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } ClLogits1DNormKernel::ClLogits1DNormKernel() @@ -267,18 +291,24 @@ ClLogits1DNormKernel::ClLogits1DNormKernel() _type = CLKernelType::ELEMENTWISE; } -void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, 
+ const ITensorInfo &sum, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { - auto padding_info = get_padding_info({ &src, &dst, &sum }); + auto padding_info = get_padding_info({&src, &dst, &sum}); // Note: output should always have a scale of 1/256 and offset 0 - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); - const DataType output_data_type = info.input_data_type; - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + const DataType output_data_type = info.input_data_type; + const QuantizationInfo allowed_quantization_info = + get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); // Output auto initialization if not yet initialized - auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); + auto_init_if_empty(dst, + src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info)); @@ -311,7 +341,10 @@ void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +Status ClLogits1DNormKernel::validate(const ITensorInfo &src, + const ITensorInfo &sum, + const ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info)); @@ -343,9 +376,8 @@ void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, :: add_3D_tensor_argument(idx, sum, sum_slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h index a221e12132..2dd53da346 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.h +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -61,14 +62,20 @@ public: * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &max, + ITensorInfo &dst, + ITensorInfo &sum, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); + static Status + validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); /** Checks if the given size is eligible for parallel reduction * * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). @@ -100,14 +107,19 @@ public: * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + const ITensorInfo &sum, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogits1DNormKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); + static Status + validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp index 6450ffb5b2..6eb2bf81c0 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -58,12 +59,12 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); + const unsigned int vec_size_y = 
adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; CLBuildOptions build_opts; @@ -89,9 +90,10 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); // Validate configured dst - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); + const TensorInfo dst_info = + src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -106,8 +108,9 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_2D(); @@ -117,9 +120,8 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C add_2D_tensor_argument(idx, src, slice); add_2D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp index ae825694c5..76f39ac500 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp @@ -26,14 +26,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - namespace arm_compute { namespace opencl @@ -42,11 +42,15 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8_SIGNED, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC); @@ -56,12 +60,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, constexpr unsigned int height_idx = 2; constexpr unsigned int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -77,15 +82,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const size_t input_width = input->dimension(width_idx); const size_t input_height = input->dimension(height_idx); const size_t weights_width = weights->dimension(width_idx); const size_t weights_height = weights->dimension(height_idx); - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); - TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -96,8 +103,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } // namespace -void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_UNUSED(biases, deconv_info); ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -119,7 +130,8 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const size_t output_channels = output->dimension(channel_idx); // Calculate output shape - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info()); @@ -147,7 +159,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const 
DataType input_data_type = input->data_type(); const PaddingInfo strides = deconv_info.stride(); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -180,7 +192,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP"); - if(is_data_type_quantized(output_data_type)) + if (is_data_type_quantized(output_data_type)) { const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -210,7 +222,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -235,8 +247,11 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co _config_id += support::cpp11::to_string(n0); } -Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *dst, const PadStrideInfo &deconv_info) +Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info)); return Status{}; @@ -250,17 +265,20 @@ void ClTransposedConvolutionKernel::run_op(ITensorPack &tensors, const Window &w // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, dst); add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h index d4350dda50..44f6f56b7a 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h @@ -45,16 +45,23 @@ public: * Similar to @ref ClTransposedConvolution::configure() * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const 
PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -63,4 +70,4 @@ public: } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp index 8f36345076..af80c4d796 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -39,7 +41,10 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *biases, + const ITensorInfo *output, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,20 +53,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1); ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && + (biases->dimension(0) != input->tensor_shape()[3])); + ARM_COMPUTE_RETURN_ERROR_ON( + (input->num_dimensions() == 5) && + (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), 
compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -75,16 +84,21 @@ ClWeightsReshapeKernel::ClWeightsReshapeKernel() _type = CLKernelType::ELEMENTWISE; } -void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups) +void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups)); - auto padding_info = get_padding_info({ src, biases, dst }); + auto padding_info = get_padding_info({src, biases, dst}); const DataType data_type = src->data_type(); @@ -104,7 +118,10 @@ void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups) +Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, + const ITensorInfo *biases, + const ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups)); return Status{}; @@ -136,7 +153,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, _kernel.setArg(idx++, src->info()->dimension(3)); _kernel.setArg(idx++, dst->info()->strides_in_bytes().z()); - if(biases != nullptr) + if (biases != nullptr) { biases_window.use_tensor_dimensions(biases->info()->tensor_shape()); biases_slice = biases_window.first_slice_window_1D(); @@ -148,7 +165,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, unsigned idx = 0; add_3D_tensor_argument(idx, src, in_slice); add_2D_tensor_argument(idx, dst, out_slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, biases_slice); ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice)); @@ -156,8 +173,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, // Run kernel enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h index 7364eb97ae..5e05f8d006 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h @@ -75,14 +75,19 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout * Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWeightsReshapeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -90,4 +95,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ \ No newline at end of file +#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp index 0a9a3f021f..15195025ce 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -63,7 +63,8 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons } } // namespace -Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status +ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); return Status{}; @@ -74,12 +75,15 @@ ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel() _type = CLKernelType::ELEMENTWISE; } -void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto 
padding_info = get_padding_info({src1, src2, dst}); const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0)); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); @@ -91,11 +95,12 @@ void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); // If input have different quantization info set quantization parameters needed for the re-quantization process const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -146,9 +151,11 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window Window slice = window.first_slice_window_4D(); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); do { @@ -159,8 +166,7 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx++, _depth); _kernel.setArg(idx++, _input1_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h index 5c54479002..8b53d6d66b 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h @@ -62,8 +62,8 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp index 54f7ad344a..c4f84e3e45 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp @@ -30,11 +30,11 @@ #include "arm_compute/core/Utils.h" #include 
"arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -45,15 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > + dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -71,22 +76,29 @@ ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); return Status{}; } void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src1, ITensorInfo *src2, - ITensorInfo *src3, ITensorInfo *src4, - ITensorInfo *dst) + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); - auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); - const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); + auto padding_info = get_padding_info({src1, src2, src3, src4, dst}); + const unsigned int min_dimension = + std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; @@ -96,9 +108,14 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + 
support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT2_ROTATE_N=" + + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + + src3->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); _depth = src1->dimension(2); _input1_width = src1->dimension(0); @@ -106,8 +123,9 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile _input3_width = src3->dimension(0); // If soources have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + const bool have_different_qinfo = + helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -166,11 +184,15 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); - const auto src3 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + const auto src2 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); + const auto src3 = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_4D(); @@ -187,8 +209,7 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx++, _input2_width); _kernel.setArg(idx++, _input3_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } 
} // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h index baf8d381be..f589b8ac1a 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h @@ -52,23 +52,32 @@ public: * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1 * @param[out] dst Destination tensor info. Data types supported: same as @p src1. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenate4TensorsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; - int32_t _input2_width{ 0 }; - int32_t _input3_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; + int32_t _input2_width{0}; + int32_t _input3_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp index 2dfe7fce52..989de4a7b7 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -53,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -74,12 +74,15 @@ Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int w return Status{}; } -void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int width_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0)); @@ -87,10 +90,11 @@ void ClWidthConcatenateKernel::configure(const 
CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); @@ -121,8 +125,9 @@ void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h index 3ace4400e6..c10d6a4dc6 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[in,out] dst Destination tensor info. Data types supported: same as @p src. 
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenateKernel::configure() @@ -63,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp index 7148a4c85c..58c01d4da5 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,14 +61,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd filter transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || + input->dimension(idx_h) != kernel_size.height); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -81,11 +86,15 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(output); - const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; + const unsigned int num_elems_processed_per_iteration_x = + input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 
1 : input->dimension(2); + const unsigned int num_elems_read_per_iteration_z = + input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); + Window win = + calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, + num_elems_read_per_iteration_z)); Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } @@ -96,21 +105,25 @@ ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel() _type = CLKernelType::WINOGRAD; } -void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Set build options CLBuildOptions build_opts; // For NHWC layouts pass tensor dimesions at runtime - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { _src_dim_z = src->dimension(2); } @@ -125,7 +138,8 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ const Size2D output_tile_size = winograd_info.output_tile_size; // Create kernel - std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); + std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -138,7 +152,9 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); @@ -161,7 +177,7 @@ void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); add_3D_tensor_argument(idx, dst, window_out); - if(src->info()->data_layout() == DataLayout::NHWC) + if (src->info()->data_layout() == DataLayout::NHWC) { _kernel.setArg(idx++, _src_dim_z); } diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h index 
b2130304e6..6e439f0c99 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradFilterTransformKernel::configure() @@ -72,7 +76,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - int32_t _src_dim_z{ 0 }; + int32_t _src_dim_z{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp index fab6c36032..54c48986fc 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -55,17 +56,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd input transform only supports unit strides"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd input transform not supported"); ARM_COMPUTE_UNUSED(conv_info); ARM_COMPUTE_UNUSED(output_tile_size); ARM_COMPUTE_UNUSED(kernel_size); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -74,7 +79,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return 
Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -82,7 +88,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. @@ -94,9 +100,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const size_t dim0 = input->dimension(0); const size_t k_sz = winograd_info.kernel_size.area(); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { - if(k_sz == 3 || k_sz == 9) + if (k_sz == 3 || k_sz == 9) { num_elems_processed_per_iteration = 2; } @@ -104,7 +110,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -113,11 +119,13 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; - AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); + AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), + num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -132,12 +140,15 @@ BorderSize ClWinogradInputTransformKernel::border_size() const return _border_size; } -void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -150,14 +161,13 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c // Compute the number of output tiles along the x and y direction of size "output_tile_size" const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), - kernel_size, - output_tile_size, - conv_info); + kernel_size, output_tile_size, conv_info); _num_tiles_x = num_tiles.width; _num_tiles_y = num_tiles.height; - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); @@ -174,7 +184,7 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _src_height = src->dimension(idx_h); CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { build_opts.add_option("-DNHWC"); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -201,13 +211,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c } // Create kernel - std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); + std::string kernel_name = + "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); // Get the maximum dimension from the tile size const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) + if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) { _step_z = (src->dimension(2) % 2) != 0 ? 
1 : 2; } @@ -239,11 +250,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); return Status{}; } @@ -263,7 +277,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & // Collapse window Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { Window slice = window_collapsed.first_slice_window_3D(); slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); @@ -298,8 +312,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h index c10c528b9b..cebebea1d3 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[in] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradInputTransformKernel::configure() @@ -69,19 +73,19 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: using WinogradKey = std::pair, std::pair>; - BorderSize _border_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - int _num_tiles_x{ 0 }; - int _num_tiles_y{ 0 }; - unsigned int _step_z{ 1 }; - int32_t _src_width{ 0 }; - int32_t _src_height{ 0 }; + BorderSize _border_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + int _num_tiles_x{0}; + int _num_tiles_y{0}; + unsigned int _step_z{1}; + int32_t _src_width{0}; + int32_t _src_height{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index bf974d30d8..89c80c55ef 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -23,7 +23,6 @@ */ #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -31,10 +30,12 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -54,7 +55,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); @@ -66,30 +71,32 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; const Size2D input_dimensions = winograd_info.input_dimensions; - const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); + const unsigned int num_channels = 
(winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * + (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), + "Winograd output transform not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); // Compute number of elements to process in the X and Y direction // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast((num_tiles.area()))); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -98,14 +105,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) +std::pair validate_and_configure_window(ITensorInfo *input, + ITensorInfo *bias, + ITensorInfo *output, + const Size2D &output_tile_size) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(bias); unsigned int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. 
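The channel-count check that the re-wrapping splits across two lines in the hunk above is easy to misread, so here is a minimal standalone sketch (not part of the diff) that evaluates the same expression; the 3x3 kernel and 4x4 output tile are assumed values chosen only to make the arithmetic concrete.

    #include <iostream>

    int main()
    {
        // Hypothetical Winograd configuration F(4x4, 3x3); both sizes are
        // assumptions for illustration, not values taken from the patch.
        const unsigned int kernel_w = 3, kernel_h = 3;
        const unsigned int tile_w   = 4, tile_h   = 4;

        // Same expression as validate_arguments() above: one transformed
        // channel per element of the (kernel + tile - 1)^2 input tile.
        const unsigned int num_channels = (kernel_w + tile_w - 1) * (kernel_h + tile_h - 1);

        std::cout << "expected input->dimension(2): " << num_channels << '\n'; // prints 36
        return 0;
    }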
@@ -115,7 +125,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const DataType dt = input->data_type(); const size_t dim0 = input->dimension(0); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { num_elems_processed_per_iteration = 2; } @@ -124,17 +134,19 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; - if(output->data_layout() == DataLayout::NCHW) + if (output->data_layout() == DataLayout::NCHW) { const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, + num_elems_processed_per_iteration); AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); window_changed = update_window_and_padding(win, input_access, output_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -144,13 +156,18 @@ ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() _type = CLKernelType::WINOGRAD; } -void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, +void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); @@ -159,7 +176,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; @@ -168,14 +185,13 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; const PadStrideInfo conv_info = winograd_info.convolution_info; - const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); + const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + 
get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); const size_t total_batches = dst->tensor_shape().total_size_upper(3); // Set build options @@ -184,11 +200,11 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) + if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) { build_opts.add_option("-DVEC_SIZE=2"); } - else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) + else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) { build_opts.add_option("-DVEC_SIZE=4"); } @@ -200,9 +216,10 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const auto act_function = act_info.activation(); const auto src_data_type = src->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (src_data_type == DataType::F32 || src_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (src_data_type == DataType::F32 || src_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -213,7 +230,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-cl-fast-relaxed-math"); } - if(_is_nhwc) + if (_is_nhwc) { build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -247,7 +264,9 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ _dst_height = dst->dimension(idx_height); // Create kernel - std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); + std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + + lower_string(string_from_data_layout(winograd_info.output_data_layout)); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -271,10 +290,18 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); } 
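To make the "macro guard" comment above concrete, the following standalone sketch (not part of the diff) rebuilds the kernel-name string the way the reformatted concatenation does and upper-cases it into the -D guard; the "4x4"/"3x3" spellings and the "nhwc" layout string are assumptions about the textual forms of Size2D::to_string() and string_from_data_layout(), used here only for illustration.

    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <string>

    int main()
    {
        // Assumed textual forms of the tile size, kernel size and data layout.
        const std::string output_tile = "4x4";
        const std::string kernel      = "3x3";
        const std::string layout      = "nhwc";

        const std::string kernel_name =
            "winograd_output_transform_" + output_tile + "_" + kernel + "_" + layout;

        // Mirrors upper_string(): the macro guard that compiles only this kernel.
        std::string guard = kernel_name;
        std::transform(guard.begin(), guard.end(), guard.begin(),
                       [](unsigned char c) { return static_cast<char>(std::toupper(c)); });

        std::cout << kernel_name << '\n';   // winograd_output_transform_4x4_3x3_nhwc
        std::cout << "-D" << guard << '\n'; // -DWINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC
        return 0;
    }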
-Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), + (bias != nullptr ? bias->clone().get() : nullptr), + dst->clone().get(), winograd_info.output_tile_size) + .first); return Status{}; } @@ -299,7 +326,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(bias != nullptr) + if (bias != nullptr) { unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); Window slice_biases; @@ -307,7 +334,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_1D_tensor_argument(idx1, bias, slice_biases); } - if(_is_nhwc) + if (_is_nhwc) { unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); _kernel.setArg(idx2++, static_cast(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); @@ -322,8 +349,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h index 6f018967d0..65bb963061 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -61,7 +62,11 @@ public: * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +74,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,11 +86,11 @@ public: private: using WinogradKey = std::pair, std::pair>; - bool _is_nhwc{ false }; - int32_t _src_height{ 0 }; - int32_t _dst_width{ 0 }; - int32_t _dst_height{ 0 }; - int32_t _num_tiles_x{ 0 }; + bool _is_nhwc{false}; + int32_t _src_height{0}; + int32_t _dst_width{0}; + int32_t _dst_height{0}; + int32_t _num_tiles_x{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp index 9350bf74bb..b5ebac3b49 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp @@ -39,14 +39,24 @@ namespace kernels { namespace gemm { -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) +std::pair configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image) { ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); ARM_COMPUTE_ERROR_ON(v0 == 0); v0 = std::max(std::min(static_cast(m / m0), static_cast(v0)), static_cast(1)); - if(h0 == 0) + if (h0 == 0) { // When h0 is 0, we should take the maximum H0 possible h0 = std::max(n / n0, 1U); @@ -62,17 +72,22 @@ std::pair configure_lhs_rhs_info(unsigned return std::make_pair(lhs_info, rhs_info); } -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair +select_lhs_rhs_info(std::pair info_img, + std::pair info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type) { - ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, "The fallback GeMM configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, + "The fallback GeMM configuration cannot have export_to_cl_image = true"); const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); const TensorInfo tensor_reshaped_info(shape, 1, data_type); - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) + if 
(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) { return info_img; } @@ -90,42 +105,56 @@ void update_padding_for_cl_image(ITensorInfo *tensor) const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); - if(pixel_alignment == 0) + if (pixel_alignment == 0) { return; } const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; - const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; - const unsigned int padding = round_up_width - stride_y_in_elements; + const unsigned int round_up_width = + ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0)); } Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) { - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, "Export to cl_image only supported with n0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, + "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, + "Export to cl_image only supported with k0 = 4, 8 or 16"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), + "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, + "Impossible to retrieve the cl_image pitch alignment"); // Check the width and height of the output tensor. 
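Since the right-padding computation in update_padding_for_cl_image() is now spread over several wrapped lines, a small standalone sketch (not part of the diff) may help; the row length of 1000 elements, the pitch alignment of 64 pixels and the 4 floats per pixel are assumed example values.

    #include <iostream>

    int main()
    {
        // Assumed example values, not taken from the patch.
        const unsigned int stride_y_in_elements = 1000;
        const unsigned int pixel_alignment      = 64;
        const unsigned int num_floats_per_pixel = 4;

        // Same rounding as update_padding_for_cl_image() above: pad each row
        // up to the next multiple of the row-pitch alignment, in elements.
        const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; // 256
        const unsigned int round_up_width =
            ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; // 1024
        const unsigned int padding = round_up_width - stride_y_in_elements;

        std::cout << "right padding (elements): " << padding << '\n'; // prints 24
        return 0;
    }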
// Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, + "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, + "Not supported height for cl_image"); } return Status{}; } -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0) +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0) { ARM_COMPUTE_UNUSED(n, k, b, data_type); @@ -141,7 +170,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const return ((k % mmul_k0) == 0) && (gws_y > 4); } -std::pair find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) { size_t min_acc = std::numeric_limits::max(); size_t min_idx = 0; @@ -150,12 +180,13 @@ std::pair find_lhs_rhs_info(const GeMMConf const size_t num_rows = configs.size(); const size_t num_cols = configs[0].size(); - ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, " + "N0. 
K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast(configs[y][0]); size_t nc0 = static_cast(configs[y][1]); @@ -168,7 +199,7 @@ std::pair find_lhs_rhs_info(const GeMMConf acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h index 6689b10e69..84776fb207 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h @@ -54,8 +54,18 @@ using GeMMConfigsMatrix = std::vector>; * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); +std::pair configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image = false); /** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo * @@ -72,9 +82,13 @@ std::pair configure_lhs_rhs_info(unsigned * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type); +std::pair +select_lhs_rhs_info(std::pair info_img, + std::pair info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type); /** Update padding required to export the OpenCL buffer to OpenCL image2d * @@ -103,8 +117,13 @@ Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, * * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise */ -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0); +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0); /** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user * @@ -116,7 +135,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +std::pair +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); } // namespace gemm } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h index a49836cfda..9d08633963 100644 --- a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h @@ -26,6 +26,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/Types.h" + 
#include "src/core/common/Macros.h" #include @@ -56,8 +57,7 @@ public: * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -69,7 +69,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -96,8 +96,7 @@ public: * * @param[in] arch GPU target */ - IClGemmKernelConfig(GPUTarget arch) - : _target(arch) + IClGemmKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); @@ -111,7 +110,8 @@ public: * @param[in] b Batch size * @param[in] data_type Data type */ - virtual std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; + virtual std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; protected: GPUTarget _target; diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp index d74c7fac9b..2f37eef31f 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,31 +39,34 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigNativeBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); + CLGEMMConfigArray configs_G71( + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); + CLGEMMConfigArray configs_G76( + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); - CLGEMMConfigArray 
configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, - &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_default_u8); + CLGEMMConfigArray configs_G7x( + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_default_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -79,18 +83,19 @@ std::pair ClGemmDefaultConfigNativeBifrost return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -105,20 +110,21 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -129,7 +135,7 @@ std::pair ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -141,9 +147,9 @@ std::pair ClGemmDefaultConfigNativeBifrost } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -159,24 +165,25 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 4196) + if (n > 4196) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); } else { - if(k < 2048) + if (k < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); } - else if(k >= 2048 && k < 16384) + else if (k >= 2048 && k < 16384) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } @@ -192,18 +199,19 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair 
+ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -214,7 +222,7 @@ std::pair ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -225,7 +233,8 @@ std::pair ClGemmDefaultConfigNativeBifrost } } -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -233,7 +242,8 @@ std::pair ClGemmDefaultConfigNativeBifrost return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); } -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -243,4 +253,4 @@ std::pair ClGemmDefaultConfigNativeBifrost } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h index 9af5dc4135..f822daae53 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h @@ -45,15 +45,22 @@ public: ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm 
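The nearest-workload search in the find_lhs_rhs_info() hunk of ClGemmHelpers.cpp earlier in this patch is the one piece of non-trivial logic the re-wrapping touches, so here is a minimal standalone sketch of the same selection (not part of the diff); the two config rows and the query workload are made-up values, and the 14-column layout follows the error message quoted in that hunk.

    #include <cstddef>
    #include <iostream>
    #include <limits>
    #include <vector>

    int main()
    {
        // Hypothetical configs matrix; columns are M, N, K, B, M0, N0, K0, V0,
        // H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS.
        const std::vector<std::vector<long long>> configs = {
            {128, 128, 256, 1, 4, 4, 4, 1, 4, 0, 1, 0, 0, 1},
            {1024, 256, 512, 1, 5, 4, 4, 1, 8, 0, 1, 0, 0, 1},
        };

        // Query GEMM workload (m, n, k, b); assumed values for illustration.
        const long long query[4] = {900, 240, 500, 1};

        long long   min_acc = std::numeric_limits<long long>::max();
        std::size_t min_idx = 0;
        for (std::size_t y = 0; y < configs.size(); ++y)
        {
            // Squared distance over the M, N, K, B columns, as in the library
            // code (sqrt omitted: it does not change which row is nearest).
            long long acc = 0;
            for (int c = 0; c < 4; ++c)
            {
                const long long d = query[c] - configs[y][c];
                acc += d * d;
            }
            if (acc < min_acc)
            {
                min_acc = acc;
                min_idx = y;
            }
        }

        std::cout << "nearest config row: " << min_idx << '\n'; // prints 1 here
        return 0;
    }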
} // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp index b9f36c7210..f87fb1b659 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,18 +39,17 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigNativeMidgard::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_default(nullptr, - nullptr, + CLGEMMConfigArray configs_default(nullptr, nullptr, &ClGemmDefaultConfigNativeMidgard::default_q8); auto func = configs_default.get_function(data_type); @@ -57,7 +57,8 @@ std::pair ClGemmDefaultConfigNativeMidgard return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -70,4 +71,4 @@ std::pair ClGemmDefaultConfigNativeMidgard } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h index c055753c48..fa76c5dba7 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h @@ -45,10 +45,12 @@ public: ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp index 95a4d2bd69..97a1298b0a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp @@ -26,6 +26,7 @@ #include 
"arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,37 +39,38 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigNativeValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, - &ClGemmDefaultConfigNativeValhall::configure_G77_f16, - &ClGemmDefaultConfigNativeValhall::configure_G77_u8); + CLGEMMConfigArray configs_default( + &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16, + &ClGemmDefaultConfigNativeValhall::configure_G77_u8); auto func = configs_default.get_function(data_type); ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -83,18 +85,19 @@ std::pair ClGemmDefaultConfigNativeValhall } } -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -109,20 +112,21 @@ std::pair ClGemmDefaultConfigNativeValhall } } -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 
2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -133,7 +137,7 @@ std::pair ClGemmDefaultConfigNativeValhall } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -145,9 +149,9 @@ std::pair ClGemmDefaultConfigNativeValhall } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -165,4 +169,4 @@ std::pair ClGemmDefaultConfigNativeValhall } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h index f0f812fd46..c91b095279 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h @@ -45,12 +45,16 @@ public: ClGemmDefaultConfigNativeValhall(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h index cf8412830b..955bb3c01a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -51,7 +51,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(gpu); diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp index 657018eb53..c956c347ef 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -43,30 +44,31 @@ namespace gemm { using namespace arm_compute::misc::shape_calculator; -ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, 
unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigReshapedBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray configs_G7x( + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray configs_G52( + &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); + CLGEMMConfigArray configs_G76( + &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -83,12 +85,13 @@ std::pair ClGemmDefaultConfigReshapedBifro return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -98,12 +101,13 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); } @@ -113,14 +117,15 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); } @@ -131,7 +136,7 @@ std::pair ClGemmDefaultConfigReshapedBifro } else 
{ - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); } @@ -142,7 +147,8 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; @@ -154,100 +160,108 @@ std::pair ClGemmDefaultConfigReshapedBifro GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(workload <= 274.4000f) + if (workload <= 274.4000f) { - if(r_nk <= 0.7461f) + if (r_nk <= 0.7461f) { - if(r_mn <= 21.1667f) + if (r_mn <= 21.1667f) { return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mk <= 17.3926f) + if (r_mk <= 17.3926f) { - if(workload <= 542.4000f) + if (workload <= 542.4000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = 
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_nk <= 0.5463f) + if (r_nk <= 0.5463f) { - if(workload <= 11767.6001f) + if (workload <= 11767.6001f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 323.4000f) + if (workload <= 323.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); } @@ -257,7 +271,8 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -268,7 +283,7 @@ std::pair ClGemmDefaultConfigReshapedBifro GEMMRHSMatrixInfo rhs_info_img; // Get lhs_info/rhs_info in case of OpenCL buffer - if(n <= 
4) + if (n <= 4) { std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -279,15 +294,17 @@ std::pair ClGemmDefaultConfigReshapedBifro // Get lhs_info/rhs_info in case of OpenCL image // Condition on the GPU workload - if((m / 4) * (n / 4) >= 2560) + if ((m / 4) * (n / 4) >= 2560) { // Big workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); } else { // Small workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -297,7 +314,7 @@ std::pair ClGemmDefaultConfigReshapedBifro // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = (n <= 4) ? false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -307,16 +324,17 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_mk = static_cast(m) / static_cast(k); - if(workload <= 1595.2000f) + if (workload <= 1595.2000f) { - if(r_mk <= 2.1044f) + if (r_mk <= 2.1044f) { - if(workload <= 870.4000f) + if (workload <= 870.4000f) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); } @@ -336,12 +354,13 @@ std::pair ClGemmDefaultConfigReshapedBifro } } -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h index d86d1ba0a7..9227ec2551 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, 
unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp index 58d0873b86..70b324eb5a 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -38,26 +39,27 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigReshapedValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray configs_G77( + &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray configs_G78( + &ClGemmDefaultConfigReshapedValhall::configure_G78_f32, 
&ClGemmDefaultConfigReshapedValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -72,12 +74,13 @@ std::pair ClGemmDefaultConfigReshapedValha return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); } @@ -87,7 +90,8 @@ std::pair ClGemmDefaultConfigReshapedValha } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -104,17 +108,17 @@ std::pair ClGemmDefaultConfigReshapedValha std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - if(r_mk <= 0.11824845522642136) + if (r_mk <= 0.11824845522642136) { - if(workload <= 880.0) + if (workload <= 880.0) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } else { - if(r_nk <= 0.42521367967128754) + if (r_nk <= 0.42521367967128754) { - if(workload <= 1726.4000244140625) + if (workload <= 1726.4000244140625) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); } @@ -123,13 +127,12 @@ std::pair ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(workload <= 1241.6000366210938) + if (workload <= 1241.6000366210938) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } @@ -142,17 +145,16 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11404.7998046875) + if (workload <= 11404.7998046875) { - if(r_mk <= 1.0126488208770752) + if (r_mk <= 1.0126488208770752) { - if(r_mn <= 2.545312523841858) + if (r_mn <= 2.545312523841858) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -161,43 +163,39 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 2881.199951171875) + if (workload <= 2881.199951171875) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - 
n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - if(r_nk <= 0.5765306055545807) + if (r_nk <= 0.5765306055545807) { - if(r_mn <= 6.010416746139526) + if (r_mn <= 6.010416746139526) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else @@ -205,27 +203,27 @@ std::pair ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); const float r_nk = static_cast(n) / static_cast(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 1288.0000f) + if (workload <= 1288.0000f) { - if(workload <= 505.6000f) + if (workload <= 505.6000f) { - if(r_mn <= 0.4466f) + if (r_mn <= 0.4466f) { - if(r_nk <= 0.2384f) + if (r_nk <= 0.2384f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -241,9 +239,9 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mn <= 0.2250f) + if (r_mn <= 0.2250f) { - if(r_mn <= 0.1599f) + if (r_mn <= 0.1599f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -254,11 +252,11 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.7609f) + if (r_mk <= 0.7609f) { - if(r_mn <= 2.5453f) + if (r_mn <= 2.5453f) { - if(workload <= 1089.6000f) + if (workload <= 1089.6000f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -281,29 +279,29 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5434.4001f) + if (workload <= 5434.4001f) { - if(workload <= 1603.2000f) + if (workload <= 1603.2000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.6192f) + if (r_nk <= 0.6192f) { - if(r_mn <= 16.1016f) + if (r_mn <= 16.1016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 2750.0000f) + if (workload <= 2750.0000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_mk <= 6.3151f) + if (r_mk <= 6.3151f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -316,15 +314,15 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.0387f) + if (r_mk <= 0.0387f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } else { - if(r_mk <= 2.5859f) + if (r_mk <= 2.5859f) { - if(r_mk <= 
0.2734f) + if (r_mk <= 0.2734f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -343,13 +341,13 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 25.7500f) + if (r_mk <= 25.7500f) { - if(r_mk <= 0.3615f) + if (r_mk <= 0.3615f) { - if(r_mn <= 0.0913f) + if (r_mn <= 0.0913f) { - if(r_mk <= 0.0683f) + if (r_mk <= 0.0683f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -365,15 +363,15 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11174.3999f) + if (workload <= 11174.3999f) { - if(r_mk <= 0.8047f) + if (r_mk <= 0.8047f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 7185.5999f) + if (workload <= 7185.5999f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -385,9 +383,9 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 17917.5000f) + if (workload <= 17917.5000f) { - if(r_mk <= 1.5078f) + if (r_mk <= 1.5078f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -398,7 +396,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 34449.6016f) + if (workload <= 34449.6016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -412,11 +410,11 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 331.1111f) + if (r_mk <= 331.1111f) { - if(workload <= 53397.5996f) + if (workload <= 53397.5996f) { - if(r_mn <= 57.8063f) + if (r_mn <= 57.8063f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -427,7 +425,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(r_nk <= 0.9211f) + if (r_nk <= 0.9211f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -439,7 +437,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 38070.4004f) + if (workload <= 38070.4004f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -453,27 +451,28 @@ std::pair ClGemmDefaultConfigReshapedValha } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float r_nk = static_cast(n) / static_cast(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 801.6000f) + if (workload <= 801.6000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_mn <= 0.1211f) + if (r_mn <= 0.1211f) { - if(workload <= 3296.0000f) + if (workload <= 3296.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 1.0625f) + if (r_nk <= 1.0625f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -485,15 +484,15 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5068.8000f) + if (workload <= 5068.8000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.2361f) + if (r_nk <= 0.2361f) { - if(workload <= 12630.0000f) + if (workload <= 12630.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } @@ -504,7 +503,7 @@ std::pair ClGemmDefaultConfigReshapedValha } else { - if(workload <= 178790.3984f) + if (workload <= 178790.3984f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -518,12 +517,13 @@ 
std::pair ClGemmDefaultConfigReshapedValha } } -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair +ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h index 466eda00a6..5f62efb59e 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h @@ -45,14 +45,20 @@ public: ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h index 1c32f1358b..83928b3f4f 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp index 9c23d9c998..c4825bfbeb 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp @@ -29,7 +29,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + #include namespace arm_compute @@ -47,33 +49,39 @@ ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBif { } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, 
DataType data_type) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); - - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - CLGEMMConfigArray configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray configs_G51( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); + + CLGEMMConfigArray configs_G52( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + CLGEMMConfigArray configs_G31( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); + + CLGEMMConfigArray configs_G76( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); + + CLGEMMConfigArray configs_G7x( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -96,14 +104,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n <= 2548) + if (n <= 2548) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); } @@ -118,12 +127,13 @@ std::pair 
ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -131,7 +141,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); } @@ -142,7 +152,8 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -154,9 +165,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn const bool is_workload_big = ((m * n * b) / 16) >= 2048; - if(m == 1) + if (m == 1) { - if(n >= 8192) + if (n >= 8192) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); @@ -164,7 +175,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 204) + if (n <= 204) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); } @@ -177,25 +188,29 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); } } // Get lhs_info/rhs_info in case of OpenCL image const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -205,7 +220,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? 
false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -215,7 +230,8 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_nk = static_cast(n) / static_cast(k); @@ -225,46 +241,49 @@ std::pair ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(workload <= 274.4000f) + if (workload <= 274.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 
2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -276,14 +295,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 2048) + if (n > 2048) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); @@ -300,7 +320,8 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; @@ -312,57 +333,59 @@ std::pair ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); - if(r_mk <= 0.0026f) + if (r_mk <= 0.0026f) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(r_mk <= 0.0148f) + if (r_mk <= 0.0148f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); - if(workload <= 362.6000f) + if (workload <= 362.6000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - if(r_mn <= 22.6067f) + if (r_mn <= 22.6067f) { - if(workload <= 708.8000f) + if (workload <= 708.8000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -371,27 +394,28 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 0.0917f) + if (r_nk <= 0.0917f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } @@ -400,15 +424,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn const float r_mn = static_cast(m) / static_cast(n); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(workload <= 7449.60f) + if (workload <= 7449.60f) { - if(workload <= 691.60f) + if (workload <= 691.60f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); } else { - if(workload <= 4155.20f) + if (workload <= 4155.20f) { return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); } @@ -420,21 +444,22 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16300.80f) + if (workload <= 16300.80f) { - if(r_mn <= 44.56f) + if (r_mn <= 44.56f) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -448,23 +473,25 @@ std::pair ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -476,14 +503,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -497,7 +525,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 2), static_cast(128)), static_cast(1)); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); } @@ -508,12 +536,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -524,12 +553,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h index 321cbb5250..77c0c8d500 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h @@ -45,21 +45,34 @@ public: ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair 
configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp index d08bf84c72..da3e2ec912 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp @@ -50,30 +50,35 @@ ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyVal { } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair ( + ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, - 
&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G77( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G78( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray configs_G710(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G710( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray configs_G715(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray configs_G715( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -96,29 +101,29 @@ std::pair ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - if(m == 1) + if (m == 1) { const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); - if(r_mk <= 0.0064484127797186375) + if (r_mk <= 0.0064484127797186375) { - if(r_mn <= 0.0028273810748942196) + if (r_mn <= 0.0028273810748942196) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - const unsigned int h0 = std::max(n / 4, 1U); + const unsigned int h0 = std::max(n / 4, 1U); std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { @@ -127,7 +132,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.020312500186264515) + if (r_mk <= 0.020312500186264515) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); } @@ -143,9 +148,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_mk = static_cast(m) / static_cast(k); - if(workload <= 
1999.2000122070312) + if (workload <= 1999.2000122070312) { - if(workload <= 747.1999816894531) + if (workload <= 747.1999816894531) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -159,15 +164,14 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mn <= 0.03348214365541935) + if (r_mn <= 0.03348214365541935) { - if(r_mk <= 0.028125000186264515) + if (r_mk <= 0.028125000186264515) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -181,8 +185,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else @@ -195,168 +198,112 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_fallback = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 }, - { 16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 
4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0}, {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0}, + {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}, + {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 
4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, }; - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_squared_fallback 
= { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast(m) / static_cast(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_fallback; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -381,17 +328,17 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -399,7 +346,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast(n / 4), 
static_cast(256)), static_cast(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); } @@ -410,30 +357,31 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); const float r_nk = static_cast(n) / static_cast(k); const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - if(m == 1) + if (m == 1) { - if(workload <= 278.7000f) + if (workload <= 278.7000f) { - if(workload <= 7.5000f) + if (workload <= 7.5000f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mn <= 0.0031f) + if (r_mn <= 0.0031f) { - if(workload <= 256.6000f) + if (workload <= 256.6000f) { - if(workload <= 16.7500f) + if (workload <= 16.7500f) { - if(r_nk <= 1.6671f) + if (r_nk <= 1.6671f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -454,15 +402,15 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.0027f) + if (r_mk <= 0.0027f) { - if(r_mk <= 0.0014f) + if (r_mk <= 0.0014f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } else { - if(workload <= 8.9500f) + if (workload <= 8.9500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } @@ -474,13 +422,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 14.1500f) + if (workload <= 14.1500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mk <= 0.0041f) + if (r_mk <= 0.0041f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -495,9 +443,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 363.7000f) + if (workload <= 363.7000f) { - if(r_mk <= 0.0031f) + if (r_mk <= 0.0031f) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); } @@ -514,9 +462,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(workload <= 704.0000f) + if (workload <= 704.0000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); } @@ -527,9 +475,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16761.6006f) + if (workload <= 16761.6006f) { - if(r_mn <= 187.1250f) + if (r_mn <= 187.1250f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -540,7 +488,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 432.4630f) + if (r_mk <= 432.4630f) { return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -553,42 +501,37 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; const float r_mn = static_cast(m) / static_cast(n); const float r_mk = static_cast(m) / static_cast(k); const float r_nk = static_cast(n) / static_cast(k); - if(m == 1) + if (m == 1) { - const GeMMConfigsMatrix configs_mnkb_best = - { - { 1, 8984, 640, 
1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_best = { + {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0}, + {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}}; return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b); } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(r_nk <= 0.8333f) + if (r_nk <= 0.8333f) { - if(r_mk <= 0.9119f) + if (r_mk <= 0.9119f) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1); } else { - if(r_nk <= 0.1181f) + if (r_nk <= 0.1181f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); } @@ -600,7 +543,7 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 1.0013f) + if (r_mk <= 1.0013f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1); } @@ -612,11 +555,11 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 11404.7998f) + if (workload <= 11404.7998f) { - if(r_mk <= 2.2884f) + if (r_mk <= 2.2884f) { - if(r_nk <= 0.9286f) + if (r_nk <= 0.9286f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -632,9 +575,9 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 1.1926f) + if (r_nk <= 1.1926f) { - if(r_mn <= 1385.7917f) + if (r_mn <= 1385.7917f) { return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -652,12 +595,13 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } @@ -667,153 +611,101 @@ std::pair ClGemmDefaultConfigReshapedRhsOn } } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 
2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, + {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, + {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - 
const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1}, }; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1 }, + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 
8, 1, 4, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1}, {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}}; - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast(m) / static_cast(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_best; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -838,17 +730,17 @@ std::pair ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair 
ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h index f2952a3d30..a0ea337eb1 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu); // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair + configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h index 1503e74eb6..e07ad993ed 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp index 2407c6ca5e..689a743fdf 100644 --- 
a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp @@ -36,7 +36,9 @@ namespace opencl { namespace kernels { -Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, + const MatMulKernelInfo &matmul_kernel_info) { const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y(); @@ -46,7 +48,7 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); constexpr size_t batch_dim_start = 2; - for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); } @@ -54,9 +56,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh return Status{}; } -std::pair validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, - const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, - int mmul_m0, int mmul_n0) +std::pair validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0) { ARM_COMPUTE_UNUSED(lhs, rhs); diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h index 210f22b109..c2ae2a67f4 100644 --- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h @@ -44,7 +44,8 @@ namespace kernels * * @return true if the shapes and matmul kernel info matches */ -Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info); /** Validate and configure window for Matmul MMUL kernels @@ -58,9 +59,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh * * @return a pair of Status and Window object */ -std::pair validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, - const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, - int mmul_m0, int mmul_n0); +std::pair validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0); } // namespace kernels } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp index 74a818d738..66877ebcec 100644 --- a/src/gpu/cl/operators/ClActivation.cpp +++ b/src/gpu/cl/operators/ClActivation.cpp @@ -23,19 +23,21 @@ */ #include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClActivationKernel.h" - #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClContext.h" +#include 
"src/gpu/cl/kernels/ClActivationKernel.h" namespace arm_compute { namespace opencl { -void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClActivation::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src, dst, act_info); auto k = std::make_unique(); @@ -53,13 +55,17 @@ namespace gpu { namespace opencl { -std::tuple ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple ClContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), + &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -68,7 +74,7 @@ std::tuple ClContext::create_activation(const AclTensor act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h index 348dc27929..4f25bb5f24 100644 --- a/src/gpu/cl/operators/ClActivation.h +++ b/src/gpu/cl/operators/ClActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -43,7 +44,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] activation_info Activation layer parameters. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &activation_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivation::configure() diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp index b9bf505bba..b58d0df58d 100644 --- a/src/gpu/cl/operators/ClAdd.cpp +++ b/src/gpu/cl/operators/ClAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClAdd.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClAdd::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique(); @@ -41,8 +44,11 @@ void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClAdd::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h index a17ce7b5d6..7aed902f5d 100644 --- a/src/gpu/cl/operators/ClAdd.h +++ b/src/gpu/cl/operators/ClAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp index 05ea21b734..8f26ef003d 100644 --- a/src/gpu/cl/operators/ClCast.cpp +++ b/src/gpu/cl/operators/ClCast.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClCast.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCastKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCast::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_LOG_PARAMS(src, dst, policy); auto k = std::make_unique(); diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h index 1b67ff7c8e..25d2293673 100644 --- a/src/gpu/cl/operators/ClCast.h +++ b/src/gpu/cl/operators/ClCast.h @@ -58,7 +58,8 @@ public: * @param[out] dst The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCast::configure() diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp index a27fc37cc4..31018b9768 100644 --- a/src/gpu/cl/operators/ClConcatenate.cpp +++ b/src/gpu/cl/operators/ClConcatenate.cpp @@ -23,9 +23,14 @@ */ #include "src/gpu/cl/operators/ClConcatenate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" #include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" #include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" @@ -33,42 +38,39 @@ #include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" #include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -#include "src/common/utils/Log.h" -#include "src/core/helpers/AutoConfiguration.h" - namespace arm_compute { namespace opencl { -void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector &src_vector, ITensorInfo *dst, size_t axis) +void ClConcatenate::configure(const CLCompileContext &compile_context, + const std::vector &src_vector, + ITensorInfo *dst, + size_t axis) { ARM_COMPUTE_ERROR_ON(dst == nullptr); ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis); _axis = axis; _num_inputs = src_vector.size(); - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); std::vector const_src_vector(src_vector.size()); - std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t; - }); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), + [](ITensorInfo *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); unsigned int offset = 0; - switch(_axis) + switch (_axis) { case Window::DimX: { - switch(_num_inputs) + switch (_num_inputs) { case 2: { @@ -82,14 +84,15 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std { // Configure WidthConcatenate4Tensors kernel auto kernel = std::make_unique(); - kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), + src_vector.at(3), dst); _concat_kernels.emplace_back(std::move(kernel)); break; } default: { // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique(); 
kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -103,7 +106,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimY: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -114,7 +117,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimZ: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -125,7 +128,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case 3: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -148,25 +151,27 @@ Status ClConcatenate::validate(const std::vector &src_vecto ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); unsigned int offset = 0; - switch(axis) + switch (axis) { case Window::DimX: { - switch(num_inputs) + switch (num_inputs) { case 2: // Validate WidthConcatenate2Tensors kernels if there are 2 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); break; case 4: // Validate WidthConcatenate4Tensors kernels if there are 4 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate( + src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); break; default: // Validate generic case of WidthConcatenate kernel - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); @@ -178,7 +183,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto } case Window::DimY: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -187,7 +192,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto } case Window::DimZ: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -196,7 +201,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto } case 3: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -207,7 +212,7 @@ Status ClConcatenate::validate(const std::vector &src_vecto ARM_COMPUTE_ERROR("Axis not supported"); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = 
arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -218,17 +223,17 @@ Status ClConcatenate::validate(const std::vector &src_vecto void ClConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast(tensors.size()) - 1 != static_cast(_num_inputs)) + if (static_cast(tensors.size()) - 1 != static_cast(_num_inputs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } - if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) { ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); @@ -236,7 +241,7 @@ void ClConcatenate::run(ITensorPack &tensors) else { int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h index de0cf84d2c..d8ce9d2a5c 100644 --- a/src/gpu/cl/operators/ClConcatenate.h +++ b/src/gpu/cl/operators/ClConcatenate.h @@ -57,7 +57,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. */ - void configure(const ClCompileContext &compile_context, const std::vector &src_vector, ITensorInfo *dst, size_t axis); + void configure(const ClCompileContext &compile_context, + const std::vector &src_vector, + ITensorInfo *dst, + size_t axis); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConcatenate::configure() @@ -71,8 +74,8 @@ public: private: std::vector> _concat_kernels{}; - unsigned int _num_inputs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_inputs{0}; + unsigned int _axis{0}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index eb9475ccaa..2c3b0214fa 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -23,17 +23,17 @@ */ #include "src/gpu/cl/operators/ClConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" -#include "src/common/utils/Log.h" - #include namespace @@ -48,7 +48,7 @@ namespace */ size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) { - switch(gpu_target) + switch (gpu_target) { case arm_compute::GPUTarget::G76: case arm_compute::GPUTarget::G77: @@ -71,27 +71,33 @@ namespace opencl { using namespace arm_compute::misc::shape_calculator; -ClConv2d::ClConv2d() - : _operator() +ClConv2d::ClConv2d() : _operator() { } ClConv2d::~ClConv2d() = default; -void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo 
&conv2d_info, - const WeightsInfo &weights_info) +void ClConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); auto f = std::make_unique(); - f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, + conv2d_info.enable_fast_math); _operator = std::move(f); break; } @@ -125,35 +131,46 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s _aux_mem = _operator->workspace(); } -Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: { //Validate Winograd - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, + conv2d_info.act_info, conv2d_info.enable_fast_math)); break; } case ConvolutionMethod::DIRECT: { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::INDIRECT: { // Validate indirect convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::GEMM: @@ -170,8 +187,12 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target) +ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_ON_NULLPTR(dst); @@ -191,20 +212,35 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const using ConvolutionConfiguration = std::tuple; using ConfigurationMethod = std::pair; - const std::vector known_configs = - { + const std::vector known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 224 - 
ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), }; const auto find_config = [&](ConfigurationMethod c) @@ -213,76 +249,89 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const const PadStrideInfo info = std::get<3>(config); const DataLayout data_layout = std::get<4>(config); - return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout()); + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride() && (data_layout == src->data_layout()); }; std::vector::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } else { - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { // SRGAN - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && + (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) + if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && + (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) { return 
ConvolutionMethod::FFT; } - if(src->dimension(idx_c) < 16) + if (src->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } - return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) + ? ConvolutionMethod::WINOGRAD + : ConvolutionMethod::GEMM; } else { - const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); - const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); + const bool is_direct_valid = + bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_wino_valid = + bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); // SRGAN case - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && is_direct_valid) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && is_direct_valid) { return ConvolutionMethod::DIRECT; } // Floating-point case: GeMM/Direct/Winograd - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); - const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; - const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; - const bool is_ofm_lt_64 = weights->dimension(3U) < 64; - const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); - const bool is_m_one = output_shape[1] * output_shape[2] == 1; - const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); - const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); + TensorShape output_shape = + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && + (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool is_ofm_lt_64 = weights->dimension(3U) < 64; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_m_one = output_shape[1] * output_shape[2] == 1; + const bool is_unit_stride = + (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); + const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); // Run Winograd if valid and IFM >= 8 - if(is_wino_valid && is_ifm_ge_8) + if (is_wino_valid && 
is_ifm_ge_8) { - if(is_ofm_lte_8) + if (is_ofm_lte_8) { - if(gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) + if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) { return ConvolutionMethod::WINOGRAD; } @@ -294,18 +343,19 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const } // Direct convolution case - if(is_direct_valid) + if (is_direct_valid) { - if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) + if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) { - if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) { return ConvolutionMethod::DIRECT; } } - else if(gpu_target == arm_compute::GPUTarget::G76) + else if (gpu_target == arm_compute::GPUTarget::G76) { - if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) { return ConvolutionMethod::DIRECT; } @@ -314,21 +364,24 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const { ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT; - const bool is_indirect_valid = bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_indirect_valid = + bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); // indirect conv2d should be called when: // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81) // 2- When the kernel size is odd // 3- When the Gpu target is Arm Mali-G77 - if(is_indirect_valid) + if (is_indirect_valid) { const bool is_kernel_sz_odd = kernel_sz % 2; const bool is_g77 = gpu_target == GPUTarget::G77; - preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT; + preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 + ? ConvolutionMethod::INDIRECT + : ConvolutionMethod::DIRECT; } // Direct/indirect convolution used for the first layer of the network - if(workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) + if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) { // In general, the question we should ask for the first convolution layer of a model is: // when the execution time of im2col + gemm < direct?. 
Since im2col does not depend on the OFM, it means that @@ -337,13 +390,13 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const return preferred_conv_method; } - if((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) + if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) { return preferred_conv_method; } // Direct convolution used for the last layer of the network - if(is_ofm_lte_8) + if (is_ofm_lte_8) { return preferred_conv_method; } diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h index c6c366a762..0cf3cbc1ce 100644 --- a/src/gpu/cl/operators/ClConv2d.h +++ b/src/gpu/cl/operators/ClConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -112,15 +113,24 @@ public: * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d * * Similar to ClConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will return the convolution called by @ref ClConv2d * @@ -137,11 +147,15 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp index 08122b6852..cf24c68d21 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp @@ -23,16 +23,19 @@ */ #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include 
"src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique(); @@ -40,9 +43,12 @@ void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_c _kernel = std::move(k); } -Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h index 2794eb17b0..c46152081c 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h @@ -43,14 +43,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp index d3b83040d0..e2be7cebd4 100644 --- a/src/gpu/cl/operators/ClCopy.cpp +++ b/src/gpu/cl/operators/ClCopy.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClCopy.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCopyKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window * return kernels::ClCopyKernel::validate(src, dst, dst_window); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h index 9b427f9675..fe9b58c607 100644 --- a/src/gpu/cl/operators/ClCopy.h +++ b/src/gpu/cl/operators/ClCopy.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COPY_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. 
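// [Editor's note] Illustrative aside, not part of the patch: a validate-level sketch for the ClCopy
// interface documented above. Tensor shape and data type are hypothetical; only the signature shown
// in this hunk is assumed.
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClCopy.h"

using namespace arm_compute;

Status check_copy()
{
    const TensorInfo src(TensorShape(32U, 16U), 1, DataType::F16);
    const TensorInfo dst(TensorShape(32U, 16U), 1, DataType::F16);
    // Passing no window copies the whole tensor; a Window can restrict the copy to a sub-region.
    return opencl::ClCopy::validate(&src, &dst, nullptr);
}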
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopy::configure() diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp index cef9f14c7d..6313e4fbb5 100644 --- a/src/gpu/cl/operators/ClCrop.cpp +++ b/src/gpu/cl/operators/ClCrop.cpp @@ -23,17 +23,22 @@ */ #include "src/gpu/cl/operators/ClCrop.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCropKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void ClCrop::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); auto k = std::make_unique(); @@ -41,9 +46,15 @@ void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInf _kernel = std::move(k); } -Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCrop::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h index 1cf1c9bff4..e845cf372c 100644 --- a/src/gpu/cl/operators/ClCrop.h +++ b/src/gpu/cl/operators/ClCrop.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_CROP_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,16 +50,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
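// [Editor's note] Illustrative aside, not part of the patch: a sketch of the ClCrop::validate() call
// whose parameters are documented above. Coordinates, batch index and tensor shapes are hypothetical;
// only the signature visible in this hunk is assumed.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/operators/ClCrop.h"

using namespace arm_compute;

Status check_crop()
{
    // Crop a 64x64 window out of a 128x128 3-channel image, batch element 0 (shapes illustrative).
    const TensorInfo    src(TensorShape(3U, 128U, 128U, 4U), 1, DataType::F32);
    const TensorInfo    dst(TensorShape(3U, 64U, 64U), 1, DataType::F32);
    const Coordinates2D start{16U, 16U};
    const Coordinates2D end{80U, 80U};
    return opencl::ClCrop::validate(&src, &dst, start, end, /* batch_index */ 0,
                                    /* extrapolation_value */ 0.f, /* dst_window */ nullptr);
}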
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCrop::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp index 0fccab63e0..eb6f9e7abb 100644 --- a/src/gpu/cl/operators/ClDequantize.cpp +++ b/src/gpu/cl/operators/ClDequantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClDequantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClDequantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp index 0215dba422..17a196ce6b 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.cpp +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -26,6 +26,8 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -35,8 +37,6 @@ #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_direct_conv; namespace arm_compute @@ -53,7 +53,8 @@ ITensorPack select_activation_src_dst(ITensorPack &tensors) return pack; } -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); @@ -65,8 +66,13 @@ DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *sr } // namespace -void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClDirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); 
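// [Editor's note] Descriptive summary of the configure() steps that follow in this hunk (not part of
// the patch): the kernel descriptor is first derived from GPU-target heuristics via
// config_direct_convolution_nhwc(); the activation is fused into ClDirectConv2dKernel only for NHWC
// floating-point inputs; a CLFillBorderKernel then pads the source with a zero value that honours the
// quantization offset for asymmetric quantized types; a standalone ClActivationKernel is appended only
// when the activation could not be fused; and finally the direct convolution kernel is statically
// tuned through CLScheduler::get().tune_kernel_static().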
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -75,15 +81,17 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); // Configure direct convolution kernel - const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo(); - auto k = std::make_unique(); + const ActivationLayerInfo conv2d_act_info = + (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info + : ActivationLayerInfo(); + auto k = std::make_unique(); k->set_target(CLScheduler::get().target()); k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc); _direct_conv_kernel = std::move(k); // Configure border handler PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { zero_value = PixelValue(0, src->data_type(), src->quantization_info()); } @@ -92,7 +100,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI _src_border_handler = std::move(b); // Fused activation is currently supported for NHWC and floating point types - if(act_info.enabled() && !conv2d_act_info.enabled()) + if (act_info.enabled() && !conv2d_act_info.enabled()) { auto a = std::make_unique(); a->configure(compile_context, dst, dst, act_info); @@ -103,14 +111,19 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); } -Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); - if(act_info.enabled()) + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); } @@ -124,7 +137,7 @@ void ClDirectConv2d::run(ITensorPack &tensors) // Run direct convolution CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); // Run activation kernel - if(_activation_kernel) + if (_activation_kernel) { auto act_pack = select_activation_src_dst(tensors); CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h index fedb9e971e..0f18490814 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.h +++ b/src/gpu/cl/operators/ClDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -59,7 +60,12 @@ public: * @param[in] act_info 
(Optional) Activation layer information in case of a fused activation. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -67,16 +73,20 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr _direct_conv_kernel{ nullptr }; - std::unique_ptr _src_border_handler{ nullptr }; - std::unique_ptr _activation_kernel{ nullptr }; + std::unique_ptr _direct_conv_kernel{nullptr}; + std::unique_ptr _src_border_handler{nullptr}; + std::unique_ptr _activation_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp index 5d37f07f31..b08347936b 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.cpp +++ b/src/gpu/cl/operators/ClDirectConv3d.cpp @@ -24,13 +24,19 @@ #include "src/gpu/cl/operators/ClDirectConv3d.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/gpu/cl/kernels/ClDirectConv3dKernel.h" namespace arm_compute { namespace opencl { -void ClDirectConv3d::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info) +void ClDirectConv3d::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0); @@ -40,7 +46,11 @@ void ClDirectConv3d::configure(const CLCompileContext &compile_context, const IT _direct_conv3d_kernel = std::move(k); } -Status ClDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info)); return Status{}; diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h index fa58b5aedd..5fb32460e2 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.h +++ b/src/gpu/cl/operators/ClDirectConv3d.h @@ -67,7 +67,12 @@ public: * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. 
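// [Editor's note] Illustrative aside, not part of the patch: as the hunks above show, ClDirectConv3d
// wraps a single kernels::ClDirectConv3dKernel and its validate() simply forwards to the kernel's
// validate(). A minimal call might look as follows; the tensor infos are supplied by the caller and
// Conv3dInfo is assumed to be default-constructible with unit strides and no padding.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "src/gpu/cl/operators/ClDirectConv3d.h"

using namespace arm_compute;

Status check_direct_conv3d(const ITensorInfo &src, const ITensorInfo &weights,
                           const ITensorInfo &biases, const ITensorInfo &dst)
{
    const Conv3dInfo conv3d_info{}; // assumed defaults: unit stride, no padding, no activation
    return opencl::ClDirectConv3d::validate(&src, &weights, &biases, &dst, conv3d_info);
}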
* */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * @@ -75,14 +80,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr _direct_conv3d_kernel{ nullptr }; + std::unique_ptr _direct_conv3d_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp index 32d2b88798..1325371d19 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.cpp +++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp @@ -23,15 +23,18 @@ */ #include "src/gpu/cl/operators/ClElementwiseOperations.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { -void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseDivision::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -39,12 +42,19 @@ void ClElementwiseDivision::configure(const ClCompileContext &compile_context, I _kernel = std::move(k); } -Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseDivision::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); } -void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMax::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -52,12 +62,19 @@ void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMax::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return 
kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); } -void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMin::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -65,12 +82,19 @@ void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMin::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info); } -void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -78,12 +102,19 @@ void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context _kernel = std::move(k); } -Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); } -void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwisePower::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique(); @@ -91,7 +122,10 @@ void ClElementwisePower::configure(const ClCompileContext &compile_context, ITen _kernel = std::move(k); } -Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwisePower::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info); } diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h index 120049cb7f..de7c018d75 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.h +++ b/src/gpu/cl/operators/ClElementwiseOperations.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -48,14 +49,21 @@ public: * @param[out] dst Destination tensor info. 
Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseDivision::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max @@ -74,14 +82,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMax::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min @@ -100,14 +115,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMin::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference @@ -126,14 +148,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
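// [Editor's note] Illustrative aside, not part of the patch: each wrapper in this header forwards to
// kernels::ClArithmeticKernel with a different ArithmeticOperation (DIV, MAX, MIN, SQUARED_DIFF,
// POWER), as the .cpp hunks above show. A minimal validate-level sketch follows; shapes and data type
// are hypothetical, and only the signatures visible in this diff are assumed.
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClElementwiseOperations.h"

using namespace arm_compute;

Status check_squared_diff()
{
    const TensorInfo lhs(TensorShape(16U, 8U), 1, DataType::F32);
    const TensorInfo rhs(TensorShape(16U, 8U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(16U, 8U), 1, DataType::F32);
    // No fused activation: act_info defaults to a disabled ActivationLayerInfo.
    return opencl::ClElementwiseSquaredDiff::validate(&lhs, &rhs, &dst);
}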
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseSquaredDiff::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power @@ -152,14 +181,21 @@ public: * @param[out] dst Destination tensor info. Data types supported:F16/F32. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwisePower::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp index f94d402c05..914621183e 100644 --- a/src/gpu/cl/operators/ClElementwiseUnary.cpp +++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp @@ -23,9 +23,8 @@ */ #include "src/gpu/cl/operators/ClElementwiseUnary.h" -#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp index ad22b15cff..817b15ab20 100644 --- a/src/gpu/cl/operators/ClFill.cpp +++ b/src/gpu/cl/operators/ClFill.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClFill.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFillKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +void ClFill::configure(const ClCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window); auto k = std::make_unique(); @@ -45,4 +47,4 @@ Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_va return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); } } // namespace opencl -} // 
namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h index 3bbe27ef71..e13862aa6b 100644 --- a/src/gpu/cl/operators/ClFill.h +++ b/src/gpu/cl/operators/ClFill.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClFill::configure() diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp index e277c0d7e4..7532532c94 100644 --- a/src/gpu/cl/operators/ClFlatten.cpp +++ b/src/gpu/cl/operators/ClFlatten.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFlatten.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp index 84f685e381..6790160172 100644 --- a/src/gpu/cl/operators/ClFloor.cpp +++ b/src/gpu/cl/operators/ClFloor.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFloor.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFloorKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp index 5845bbc69e..6969ac8ab3 100644 --- a/src/gpu/cl/operators/ClFullyConnected.cpp +++ b/src/gpu/cl/operators/ClFullyConnected.cpp @@ -24,12 +24,13 @@ #include "src/gpu/cl/operators/ClFullyConnected.h" #include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -38,11 +39,8 @@ #include "src/gpu/cl/operators/ClMatMul.h" #include "src/gpu/cl/operators/ClTranspose.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include @@ -62,8 +60,11 @@ inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src) return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation } -Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst, - 
GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +Status construct_gemmlowp_output_stage(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, + ActivationLayerInfo activation_info) { gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; gemmlowp_output_stage.gemmlowp_offset = 0; @@ -73,7 +74,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const auto data_type = src.data_type(); // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const QuantizationInfo oq_info = dst.quantization_info(); const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); @@ -85,15 +86,17 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); - if(activation_info.enabled()) + if (activation_info.enabled()) { - std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + std::tie(type_min, type_max) = + get_quantized_activation_min_max(activation_info, data_type, output_quant_info); } // Set the GEMMLowp output stage info @@ -109,31 +112,41 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info, bool use_matmul) +Status validate_mm(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo *bias, + const ITensorInfo &dst, + const FullyConnectedLayerInfo &fc_info, + bool use_matmul) { // Note : If input is dynamic and data is not batched, use matmul, else use gemm const bool transpose_weights = fc_info.transpose_weights ? 
!fc_info.are_weights_reshaped : false; - const bool use_dynamic_gemm = !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul - const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); + const bool use_dynamic_gemm = + !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul + const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); - if(use_matmul) + if (use_matmul) { const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights); // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1] TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape())); - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr t = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr t = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); - return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info) : - kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info); + return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, + kernel_info, fc_info.activation_info) + : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, + fc_info.activation_info); } else { GEMMLowpOutputStageInfo gemmlowp_output_stage; - ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -147,7 +160,7 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe true, // broadcast_bias ActivationLayerInfo()); // activation_info - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); @@ -158,11 +171,9 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate( + &src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info)); } else { @@ -188,11 +199,15 @@ ClFullyConnected::ClFullyConnected() ClFullyConnected::~ClFullyConnected() = default; -void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void 
ClFullyConnected::configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // If weights are dynamic and matmul is supported use matmul, else use gemm - if(_use_matmul) + if (_use_matmul) { // Specify whether transpose weights is necessary in matmul info const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights); @@ -202,22 +217,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape())); // 2. Use heuristics to get kernel info object - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr kernel_config = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); // 3. Configure relevant matmul kernel - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel = std::make_unique(); _matmul_lowp_native_kernel->set_target(gpu_target); - _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } else { _matmul_native_kernel = std::make_unique(); _matmul_native_kernel->set_target(gpu_target); - _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } } else @@ -238,7 +256,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe true, // broadcast_bias fc_info.activation_info); // activation_info - if(_is_quantized) + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -248,8 +266,10 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); - weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_info.set_quantization_info( + QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, + -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp = std::make_unique(); @@ -264,16 +284,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe } } -void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_conv_fc(const 
CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. - ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != + (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be linearized // Initialize output tensor for flatten - _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW); + _flattened_src = src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW); // Configure flatten kernel _flatten = std::make_unique(); @@ -284,7 +313,11 @@ void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. @@ -294,7 +327,11 @@ void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, configure_mm(compile_context, src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClFullyConnected::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -317,8 +354,9 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched. // 3. 
When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required) const bool is_batched_fc_layer = dst->dimension(1) > 1; - _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; + _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -327,11 +365,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 4) Fully Connected layer -> Fully Connected layer with batches // Check if we have a fully connected layer with batches - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -341,7 +379,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso ITensorInfo *weights_used = weights; // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op. 
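
The reformatted condition above is where the operator decides between the MatMul and GEMM paths: MatMul is only picked for dynamic (non-constant) weights on a post-Midgard target, when the layer is not batched and no weights-layout conversion is needed; when weights are dynamic but MatMul is ruled out and a transpose is still required, the dynamic-GEMM fallback is used. A minimal standalone restatement of that rule, using our own names rather than Compute Library types:

#include <cstdio>

// Stand-alone sketch of the fully connected dispatch rule; every name below is
// illustrative, not Compute Library API.
struct FcDispatch
{
    bool use_matmul;
    bool dynamic_gemm;
};

FcDispatch select_fc_path(bool post_midgard_gpu,
                          bool weights_are_constant,
                          bool is_batched,
                          bool needs_weights_conversion,
                          bool transpose_weights)
{
    // Mirrors: gpu_target != MIDGARD && !constant && !batched && !(layout mismatch)
    const bool use_matmul = post_midgard_gpu && !weights_are_constant && !is_batched && !needs_weights_conversion;
    // Dynamic GEMM is the fallback when weights are dynamic but still need transposing.
    const bool dynamic_gemm = !weights_are_constant && transpose_weights && !use_matmul;
    return {use_matmul, dynamic_gemm};
}

int main()
{
    const FcDispatch d = select_fc_path(true /*post_midgard*/, false /*constant*/, false /*batched*/,
                                        false /*conversion*/, true /*transpose*/);
    std::printf("use_matmul=%d dynamic_gemm=%d\n", d.use_matmul, d.dynamic_gemm); // prints 1 0
    return 0;
}
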
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Reshape the weights _reshape_weights = std::make_unique(); @@ -351,14 +389,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique(); - _convert_weights->configure(compile_context, - weights_used, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); weights_used = &_converted_weights; @@ -366,7 +401,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso _run_convert_weights = true; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); @@ -379,60 +414,69 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion) _weights_to_use = *weights_used; - if(_use_matmul) + if (_use_matmul) { // Note : MatMul does not use transpose and does not need auxillary memory, so only converted weights are added to aux_mem - _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = + MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); } else { // Set auxiliary memory requirements for gemm operators auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch - const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare; - const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? 
MemoryLifetime::Persistent : MemoryLifetime::Prepare; - - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, - _converted_weights.total_size()); + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, + _converted_weights.total_size()); } } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status ClFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target()); const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; @@ -441,11 +485,20 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei // When using dynamic weights - use matmul kernels. // Note: MatMul does not support broadcasting so fallback with batched cases. 
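
The auxiliary-memory bookkeeping reformatted above encodes a lifetime rule for the intermediate weights tensors: with dynamic weights (dynamic GEMM) they are Temporary and recomputed on every run; otherwise, in the branch where the GEMM backend does not keep its own persistent weights copy, the tensor actually consumed by the GEMM stays Persistent while the other only has to survive prepare(). A small sketch of that rule under those assumptions, using a stand-in enum rather than the library's MemoryLifetime:

#include <cstdio>

enum class Lifetime // stand-in for the library's MemoryLifetime
{
    Temporary,
    Prepare,
    Persistent
};

// Lifetime of a reshaped/converted weights tensor, assuming the GEMM backend
// does not hold its own persistent copy of the weights.
Lifetime weights_lifetime(bool dynamic_gemm, bool consumed_by_gemm)
{
    if (dynamic_gemm)
    {
        return Lifetime::Temporary; // recomputed on every run, keep nothing around
    }
    return consumed_by_gemm ? Lifetime::Persistent // the GEMM reads this tensor at run time
                            : Lifetime::Prepare;   // only needed until prepare() finishes
}

int main()
{
    std::printf("%d %d %d\n",
                static_cast<int>(weights_lifetime(true, true)),    // 0 (Temporary)
                static_cast<int>(weights_lifetime(false, true)),   // 2 (Persistent)
                static_cast<int>(weights_lifetime(false, false))); // 1 (Prepare)
    return 0;
}
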
const bool is_batched_fc_layer = dst->dimension(1) > 1; - const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) ? TensorInfo(*reshaped_weights.clone()) : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); + const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && + !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + + const ITensorInfo &flatten_src = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) + ? TensorInfo(*reshaped_weights.clone()) + : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -456,10 +509,10 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei const ITensorInfo *src_to_use = src; const ITensorInfo *weights_to_use = weights; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -470,11 +523,11 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Check if FC is after conv (flatten kernel is run in case where FC is after conv.) - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -482,29 +535,28 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Transpose kernel does not run when matmul is supported as matmul fuses transpose op. 
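
The std::equal comparison above is how a batched layer is recognised as "fully connected after convolution": the source must still carry its convolution dimensions, and its batch dimensions (index 3 onwards) must line up with the destination's dimensions from index 1 onwards. A self-contained illustration of that check on plain shape vectors (not the library's TensorShape; the size guard is our simplification of the num_max_dimensions test):

#include <algorithm>
#include <cstdio>
#include <vector>

// Dimension order follows the library's convention: index 0 is the innermost dimension.
bool is_fc_after_conv(const std::vector<unsigned int> &src_shape, const std::vector<unsigned int> &dst_shape)
{
    if (src_shape.size() < 4) // source no longer carries W, H, C plus batches
    {
        return false;
    }
    // Batch dimensions of the source must match dst dimensions from index 1 onwards.
    return std::equal(src_shape.cbegin() + 3, src_shape.cend(), dst_shape.cbegin() + 1);
}

int main()
{
    // 7x7x64 convolution output with a batch of 8 feeding an FC layer that produces 1000x8.
    std::printf("%d\n", is_fc_after_conv({7, 7, 64, 8}, {1000, 8})); // 1
    // FC -> FC with batches: the source is already two-dimensional.
    std::printf("%d\n", is_fc_after_conv({4096, 8}, {1000, 8})); // 0
    return 0;
}
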
- if(transpose_weights && !use_matmul) + if (transpose_weights && !use_matmul) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1; - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); @@ -539,24 +591,24 @@ void ClFullyConnected::run(ITensorPack &tensors) CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src); - if(_weights_to_use_idx != ACL_SRC_1) + if (_weights_to_use_idx != ACL_SRC_1) { gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); } // Run MatMul Op - if(_use_matmul) + if (_use_matmul) { // Run matmul kernels for matrix multiplication - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true); } @@ -568,7 +620,7 @@ void ClFullyConnected::run(ITensorPack &tensors) else { // Run matrix multiply - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp->run(gemm_pack); } @@ -582,7 +634,7 @@ void ClFullyConnected::run(ITensorPack &tensors) void ClFullyConnected::prepare(ITensorPack &tensors) { // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed. - if(!_is_prepared || _dynamic_gemm) + if (!_is_prepared || _dynamic_gemm) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -598,10 +650,10 @@ void ClFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose. 
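
In the run() path reformatted above, a source coming from a convolution is first linearised by the flatten kernel before being handed to the matrix multiply. The idea, assuming a [W, H, C, batches...] source whose first three dimensions collapse into one (compute_flatten_shape itself is not reproduced here), is sketched below:

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Collapse W, H and C into a single dimension and keep any batch dimensions.
// Assumes the shape has at least three dimensions; index 0 is the innermost one.
std::vector<unsigned int> flatten_for_fc(const std::vector<unsigned int> &shape)
{
    std::vector<unsigned int> flat;
    flat.push_back(std::accumulate(shape.begin(), shape.begin() + 3, 1u, std::multiplies<unsigned int>()));
    flat.insert(flat.end(), shape.begin() + 3, shape.end());
    return flat;
}

int main()
{
    const auto flat = flatten_for_fc({7, 7, 64, 8}); // convolution output, batch of 8
    std::printf("%u x %u\n", flat[0], flat[1]);      // 3136 x 8
    return 0;
}
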
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; _reshape_weights->run(transpose_pack); cur_weights->mark_as_unused(); @@ -609,9 +661,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) } // Convert weights if needed - if(_run_convert_weights) + if (_run_convert_weights) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -622,9 +674,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Prepare GEMM prepare and release unused weights - if(_dynamic_gemm || !_use_matmul) + if (_dynamic_gemm || !_use_matmul) { - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h index d975859d87..0621238ab5 100644 --- a/src/gpu/cl/operators/ClFullyConnected.h +++ b/src/gpu/cl/operators/ClFullyConnected.h @@ -47,7 +47,7 @@ namespace kernels { class ClMatMulNativeKernel; class ClMatMulLowpNativeKernel; -} +} // namespace kernels /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels: * * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer) @@ -88,7 +88,11 @@ public: * Data type supported: Same as @p src. * @param[in] fc_info (Optional) Fully connected layer additional info */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -96,18 +100,36 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); // Inherited methods overriden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo 
*weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); private: enum AuxTensorIdx @@ -134,19 +156,19 @@ private: TensorInfo _reshaped_weights{}; TensorInfo _lhs_to_use{}; TensorInfo _weights_to_use{}; - int _weights_to_use_idx{ ACL_SRC_1 }; + int _weights_to_use_idx{ACL_SRC_1}; - bool _run_convert_weights{ false }; - bool _transpose_weights{ false }; - bool _dynamic_gemm{ false }; - bool _use_matmul{ false }; + bool _run_convert_weights{false}; + bool _transpose_weights{false}; + bool _dynamic_gemm{false}; + bool _use_matmul{false}; - bool _is_fc_after_conv{ true }; - bool _is_quantized{ false }; - bool _is_prepared{ false }; + bool _is_fc_after_conv{true}; + bool _is_quantized{false}; + bool _is_prepared{false}; #ifdef ARM_COMPUTE_ASSERTS_ENABLED - int _asrt_run_count {}; + int _asrt_run_count{}; int _asrt_prepare_count{}; #endif // ARM_COMPUTE_ASSERTS_ENABLED }; diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 7e331a86f3..815c254c69 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -33,11 +33,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -45,8 +46,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include "utils/TypePrinter.h" @@ -67,35 +66,43 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) return kernel_type == CLGEMMKernelType::NATIVE ? 
false : true; } //Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type -inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) +inline CLGEMMKernelType +auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) { - if(!constant_weights) + if (!constant_weights) { return CLGEMMKernelType::NATIVE; } auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -103,12 +110,14 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.has_pad_y = false; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } gemm_kernel_info.has_pad_y = true; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -116,49 +125,65 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } //Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -inline std::pair 
auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, - const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output) +inline std::pair +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped kernel -inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d) +inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info, + bool reinterpret_input_as_3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; // Validate reshape LHS kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); - if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) + auto_init_if_empty(tmp_a_info, + a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); + if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) { return false; } // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } // Validate mm kernel gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; - 
if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -166,21 +191,32 @@ inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, co } //Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs -inline std::pair auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d) +inline std::pair +auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + bool reinterpret_input_as_3d) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d)) + if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, + reinterpret_input_as_3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), + to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } // namespace @@ -200,18 +236,24 @@ ClGemm::ClGemm() { } -void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -225,24 +267,32 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn // Set the target for the kernels _mm_native_kernel->set_target(gpu_target); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Configure and tune matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info); + _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, + kernel_info); } -void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -261,32 +311,42 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, - c, output, gemm_info.reinterpret_input_as_3d()); + std::tie(lhs_info, rhs_info) = + auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, + kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d()); _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); // Configure and tune matrix multiply kernel - _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for LHS and RHS reshape matrix _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -304,7 +364,8 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output); // Transpose matrix _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); @@ -315,24 +376,33 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -350,9 +420,10 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -361,13 +432,22 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, + rhs_info, kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -376,12 +456,12 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -392,15 +472,23 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate( + a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -412,12 +500,12 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -433,23 +521,33 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = + select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape( + compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, + beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -460,12 +558,12 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -481,24 +579,33 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); kernel_info.has_pad_y = true; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -508,12 +615,12 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -529,9 +636,10 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -540,12 +648,20 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +void ClGemm::configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -558,20 +674,21 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, _is_prepared = gemm_info.retain_internal_weights(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Select GEMMType - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run, - b->are_values_constant()); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size}, + _reshape_b_only_on_first_run, b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -600,35 +717,41 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, } } -Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Get the GPU target bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Check data type early because the auto_select_gemm_kernel has assertions on supported data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); // Select GEMMType - CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery - { - CLScheduler::get().target(), - a->data_type(), - m, - n, - k, - batch_size, - }, - gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); + CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{ + CLScheduler::get().target(), + a->data_type(), + m, + n, + k, + batch_size, + }, + gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); const ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(gemm_kernel_type) + switch (gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -647,7 +770,8 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); break; } default: @@ -674,7 +798,7 @@ void ClGemm::run(ITensorPack &tensors) prepare(tensors); // Run matrix multiply kernel - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -684,13 +808,13 @@ void ClGemm::run(ITensorPack &tensors) case CLGEMMKernelType::RESHAPED: { // Run interleave kernel - ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } }; + ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts @@ -698,7 +822,7 @@ void ClGemm::run(ITensorPack &tensors) gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get()); gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED) { CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); } @@ -706,10 +830,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -722,7 +846,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -734,10 +858,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -750,7 +874,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -769,20 +893,22 @@ void ClGemm::run(ITensorPack &tensors) void ClGemm::prepare(ITensorPack &constants) { - 
if(!_is_prepared) + if (!_is_prepared) { - const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); - ICLTensor *rhs_aux = utils::cast::polymorphic_downcast(constants.get_tensor(offset_int_vec(RhsReshape))); + const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); + ICLTensor *rhs_aux = + utils::cast::polymorphic_downcast(constants.get_tensor(offset_int_vec(RhsReshape))); // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed - if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) + if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && + (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) { ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!"); CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); - ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true); } _is_prepared = true; diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h index 11f9f2b3d8..85dc1d6c8f 100644 --- a/src/gpu/cl/operators/ClGemm.h +++ b/src/gpu/cl/operators/ClGemm.h @@ -90,30 +90,95 @@ public: * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping * in case matrix A and matrix B have been already transformed. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: - void configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + 
void configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); - static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); private: enum AuxTensorIdx diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp index 5620471ff9..55d815a1ef 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.cpp +++ b/src/gpu/cl/operators/ClGemmConv2d.cpp @@ -28,10 +28,12 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -41,8 +43,6 @@ #include "src/gpu/cl/operators/ClGemm.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -53,18 +53,38 @@ using namespace utils::cast; namespace opencl { ClGemmConv2d::ClGemmConv2d() - : _weights_reshape_kernel(nullptr), 
_im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(), - _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _col2im_kernel(nullptr), + _activation_kernel(nullptr), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _fuse_activation(true), + _append_bias(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } ClGemmConv2d::~ClGemmConv2d() = default; -void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info) + int gemm_3d_depth, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -77,18 +97,20 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - TensorInfo tmp_src{ *src }; - if(_is_quantized) + TensorInfo tmp_src{*src}; + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); const QuantizationInfo weights_quantization_info = weights->quantization_info(); - tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + tmp_src.set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); _mm_gemmlowp = std::make_unique(); _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info); @@ -97,7 +119,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I weights->set_quantization_info(weights_quantization_info); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -108,15 +130,21 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I _mm_gemm = std::make_unique(); _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto 
mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) +Status ClGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info) { const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type()); @@ -131,9 +159,9 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -142,8 +170,10 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig std::unique_ptr src_qa = src->clone(); std::unique_ptr weights_qa = weights->clone(); - src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_qa->set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights_qa->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Perform validation step on GEMMLowp return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info); @@ -155,14 +185,17 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig } } -void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) +void ClGemmConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, - conv2d_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); const DataType data_type = src->data_type(); @@ -180,7 +213,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + 
conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); _skip_col2im = data_layout == DataLayout::NHWC; // Only for quantize there are few cases where we cannot fuse the activation function in GEMM @@ -197,12 +231,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; @@ -210,28 +240,31 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _append_bias = false; _weights_reshape_kernel = std::make_unique(); - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor biases_to_use = nullptr; _append_bias = true; - _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, + conv2d_info.num_groups); } else { - _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, + conv2d_info.num_groups); } // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { // Configure and tune im2col. im2col output shape is auto-initialized _im2col_kernel = std::make_unique(); // Set the GPU target for im2col _im2col_kernel->set_target(CLScheduler::get().target()); - _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); + _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), + conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); // Set quantization info _im2col_output.set_quantization_info(src->quantization_info()); @@ -242,7 +275,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf } // Create GEMM output tensor - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -263,7 +296,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf gemmlowp_output_stage.gemmlowp_offset = 0; // Configure output stage for quantized case - if(_is_quantized) + if (_is_quantized) { const auto output_quant_info = (dst->total_size() == 0) ? 
iq_info : oq_info; const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); @@ -286,16 +319,16 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf auto min_activation = min_val.get(); auto max_activation = max_val.get(); - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -313,48 +346,60 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); - if(!_skip_col2im) + if (!_skip_col2im) { // Set the GPU target for col2im _col2im_kernel = std::make_unique(); _col2im_kernel->set_target(CLScheduler::get().target()); // Configure and tune Col2Im - _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups); + _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), + conv2d_info.num_groups); CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); } ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - if(!_fuse_activation) + if (!_fuse_activation) { _activation_kernel = std::make_unique(); _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); } - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = + MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); } -Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const 
Conv2dInfo &conv2d_info, +Status ClGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), + "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && + (src->data_layout() == DataLayout::NCHW)); const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -374,18 +419,19 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights const ITensorInfo *gemm_output_to_use = dst; const ITensorInfo *weights_to_use = weights; const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 - && conv2d_info.conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != + src->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -397,7 +443,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights 
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); } - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a()); } @@ -406,48 +452,50 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; const ITensorInfo *biases_to_use = biases; bool append_bias = false; - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); + biases_to_use = nullptr; + append_bias = true; + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); } else { - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); } weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const Size2D kernel_dims(kernel_width, kernel_height); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, + conv2d_info.num_groups == 1, conv2d_info.num_groups); auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, + append_bias, conv2d_info.dilation, conv2d_info.num_groups)); gemm_input_to_use = &im2col_reshaped_info; } // Create GEMM output tensor - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm; @@ -465,7 +513,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights gemmlowp_output_stage.gemmlowp_offset = 0; gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -483,16 +531,16 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights int min_activation = 0; int max_activation = 0; - const std::set 
supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -509,16 +557,18 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); // Validate Col2Im - if(!skip_col2im) + if (!skip_col2im) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); } // Validate Activation Layer - if(!fuse_activation) + if (!fuse_activation) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); } @@ -541,30 +591,26 @@ void ClGemmConv2d::run(ITensorPack &tensors) CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); // Run im2col - if(!_skip_im2col) + if (!_skip_im2col) { - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false); gemm_input_to_use = im2col_output.get(); } - if(!_skip_col2im) + if (!_skip_col2im) { gemm_output_to_use = gemm_output.get(); } ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); - if(!_append_bias) + if (!_append_bias) { pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -576,43 +622,32 @@ void ClGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix - if(!_skip_col2im) + if (!_skip_col2im) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; 
CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false); } //Run Activation Layer if we cannot fuse in GEMM - if(!_fuse_activation) + if (!_fuse_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false); } } void ClGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Run weights reshaping and mark original weights tensor as unused - ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(WeightsReshaped))); + ICLTensor *weights_reshaped_p = + utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(WeightsReshaped))); CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; - if(_append_bias) + if (_append_bias) { const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); pack.add_const_tensor(TensorType::ACL_BIAS, biases); diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h index 8a46ee2dc3..e8f3147ac3 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.h +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -100,15 +101,24 @@ public: * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: @@ -130,9 +140,14 @@ private: * @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] act_info Activation to apply after the matrix multiplication */ - void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure_mm(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info); + int gemm_3d_depth, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
@@ -148,8 +163,14 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info); enum AuxTensorIdx { diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp index 2622274587..71c247de79 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -52,7 +52,7 @@ namespace { inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: case CLGEMMKernelType::RESHAPED_ONLY_RHS: @@ -71,32 +71,41 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) { auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for native kernel -inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo mm_result_s32_info{}; // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty( + mm_result_s32_info, + a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); // Validate mm kernel // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info // NOTE: This assumes: // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). // 2. 
lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). - if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, + reshape_info))) { return false; } @@ -104,31 +113,45 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons } // Automatically select between mlgo (prioritized) and default heuristics for native kernel configs -std::pair auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +std::pair auto_select_gemm_config_native(auto_heuristics::CommonQuery query, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { auto config = auto_heuristics::select_mlgo_gemm_config_native(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) + if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_native(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -148,7 +171,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo 
output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -156,14 +180,22 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -183,7 +215,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -191,40 +224,55 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo 
heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair +auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d); auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), - to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: return false; @@ -254,8 +302,11 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default; void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, - ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, - const GEMMInfo &gemm_info) + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info)); @@ -263,8 +314,8 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->quantization_info().uniform().offset; - _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && a->data_type() == DataType::QASYMM8; + _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && 
+ is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8; _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset; _gemm_info = gemm_info; @@ -282,17 +333,18 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Arguments used by GEMMReshapeInfo // in order to know how the matrices have been reshaped bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { // Set data type for converted weights _qasymm8_weights = *b; @@ -301,47 +353,50 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_tmp_b, + rhs_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, + rhs_info); } // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; + const GEMMLowpReductionKernelInfo reduction_info{}; // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, + &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); @@ -360,17 +415,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.a_offset = _a_offset; gemm_kernel_info.b_offset = _b_offset; // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32); _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); - if(num_filters == 1) + if (num_filters == 1) { // Per-channel quantization with OFM == 1 is equivalent to uniform quantization. 
// Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts @@ -379,55 +436,67 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_mmul_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else { _run_output_stage = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? 
&_qasymm8_weights : matrix_b, reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); - - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, - &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, + reshape_info); + + _offset_contribution_output_stage_kernel->configure( + compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0), + _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, + &_gemm_output_stage_shifts); } } } else { _run_offset_contribution = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); @@ -436,44 +505,65 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); // Configure matrix multiply kernel _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info); } // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + a->dimension(0), _a_offset, _b_offset); } // Request memory - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - if(is_gemm_reshaped(_gemm_kernel_type)) + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + if (is_gemm_reshaped(_gemm_kernel_type)) { // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - } - if(_a_offset != 0) - { - _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size()); - } - if(_b_offset != 0) - { - _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - } - _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size()); - _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + } + if (_a_offset != 0) + { + _aux_mem[VecSumCol] = + MemoryInfo(offset_int_vec(VecSumCol), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + } + if (_b_offset != 0) + { + _aux_mem[VecSumRow] = + MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + } + _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, + _gemm_output_stage_multipliers.total_size()); + _aux_mem[Shifts] = + MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); } -Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); @@ -492,39 +582,44 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GPUTarget gpu_target = CLScheduler::get().target(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run())); + bool reshape_matrix_b = is_gemm_reshaped( + auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, + gemm_info.reshape_b_only_on_first_run())); const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); + bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && + is_data_type_quantized_asymmetric(a->data_type()); TensorInfo weights_info(*b); - if(convert_to_qasymm8) + if (convert_to_qasymm8) { b_offset = -128; weights_info.set_data_type(DataType::QASYMM8); ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); } const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) + if (reshape_matrix_b) { matrix_b_info = &tmp_b_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); + auto_init_if_empty(tmp_b_info, + weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); } @@ -533,21 +628,23 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GEMMLowpReductionKernelInfo reduction_info; // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, 
reduction_info)); } GEMMKernelInfo gemm_kernel_info; @@ -560,92 +657,99 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.a_offset = a_offset; gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + const TensorInfo gemm_output_stage_multipliers_shifts_info( + TensorInfo(TensorShape(num_filters), 1, DataType::S32)); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (reshape_matrix_b && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info, + &gemm_output_stage_multipliers_shifts_info)); } else { TensorInfo mm_result_s32_info{}; - if(reshape_matrix_b) + if (reshape_matrix_b) { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, reshape_info)) + .set_data_type(DataType::S32)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); } else { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, false, reshape_info)) + .set_data_type(DataType::S32)); // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage, + &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info)); } } else { - if(reshape_matrix_b) + if (reshape_matrix_b) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info)); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + c, a_offset, b_offset)); } } @@ -675,73 +779,61 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) const ITensor *matrix_a = a; const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { matrix_b = tmp_b.get(); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run reshape matrix B - ITensorPack mtx_b_reshape_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? 
rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack mtx_a_red_pack = - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_DST, vec_sum_row.get() } - }; + ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}}; CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); } // Run matrix multiply - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { ITensorPack gemm_reshaped_pack; - if(_run_offset_contribution) + if (_run_offset_contribution) { - gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst } - }); + gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}}); } else { - gemm_reshaped_pack = ITensorPack( - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + gemm_reshaped_pack = ITensorPack({ + {TensorType::ACL_SRC, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false); } @@ -752,46 +844,39 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - ITensorPack gemm_native_pack = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() } - }; + ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}}; CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); } - if(_run_output_stage) + if (_run_output_stage) { // Run offset contribution/output stage kernel - ITensorPack output_stage_pack = - { - { TensorType::ACL_SRC, res32.get() }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? 
nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + ITensorPack output_stage_pack = { + {TensorType::ACL_SRC, res32.get()}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); } - if(_run_offset_contribution) + if (_run_offset_contribution) { // Run offset contribution kernel - ITensorPack offset_contrib_pack = - { - { TensorType::ACL_SRC_DST, dst }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() } - }; + ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); } } void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); @@ -800,56 +885,55 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) ARM_COMPUTE_ERROR_ON_NULLPTR(b); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { - ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } }; + ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}}; CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); b->mark_as_unused(); } - if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) + if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) { // Run reshape kernel and mark original weights tensor as unused - ITensorPack mtx_b_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); b->mark_as_unused(); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) + if (_a_offset != 0 && _reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Compute GEMM output multipliers and shifts for output stage { - const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? 
_gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false); CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false); ICLTensor *multiplier_tensor = multipliers.get(); - if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) + if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) { multiplier_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); + std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), + num_filters * sizeof(int32_t)); multiplier_tensor->unmap(CLScheduler::get().queue()); } ICLTensor *shifts_tensor = shifts.get(); - if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) + if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) { shifts_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); + std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); shifts_tensor->unmap(CLScheduler::get().queue()); } } diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h index 6e32a90fc4..c80dc3a182 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -93,18 +93,27 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -130,7 +139,7 @@ private: std::unique_ptr _mtx_a_reduction_kernel; std::unique_ptr _mtx_b_reduction_kernel; std::unique_ptr _offset_contribution_kernel; - std::unique_ptr _offset_contribution_output_stage_kernel; + std::unique_ptr _offset_contribution_output_stage_kernel; // Temporary tensors TensorInfo _qasymm8_weights{}; @@ -141,13 +150,13 @@ private: TensorInfo _gemm_output_stage_multipliers{}; TensorInfo 
_gemm_output_stage_shifts{}; - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - bool _reshape_b_only_on_first_run{ false }; - bool _run_output_stage{ false }; - bool _convert_to_qasymm8{ false }; - bool _run_offset_contribution{ false }; - bool _is_prepared{ false }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + bool _reshape_b_only_on_first_run{false}; + bool _run_output_stage{false}; + bool _convert_to_qasymm8{false}; + bool _run_offset_contribution{false}; + bool _is_prepared{false}; GEMMInfo _gemm_info{}; CLGEMMKernelType _gemm_kernel_type{}; diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp index a61b11a3b1..e3363e3685 100644 --- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp @@ -27,22 +27,25 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { @@ -70,12 +73,16 @@ void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, c } } -Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info); @@ -94,7 +101,7 @@ void ClGemmLowpOutputStage::run(ITensorPack &tensors) const ITensor *bias = tensors.get_const_tensor(ACL_BIAS); ITensor *dst = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_kernel, pack, true); } } // namespace opencl diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h index 3f1b04dcce..6357e0200b 100644 --- a/src/gpu/cl/operators/ClGemmLowpOutputStage.h +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h @@ -71,14 +71,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info GEMMLowp output stage metadata. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpOutputStage::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp index b900974574..777fc9e5e1 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.cpp +++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp @@ -27,16 +27,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h" #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h" #include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_indirect_conv; namespace arm_compute @@ -47,7 +46,8 @@ using namespace arm_compute::experimental; namespace { -DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); @@ -59,8 +59,13 @@ DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo * } // namespace -void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClIndirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -86,25 +91,29 @@ void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITenso CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel); // Request memory for the indirect buffer - _aux_mem[IndirectBuffer] = MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size()); + _aux_mem[IndirectBuffer] = + MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size()); } -Status ClIndirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo 
*biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClIndirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info); - TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(src, weights, &indirect_buffer, conv_info, desc)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate( + src, weights, &indirect_buffer, conv_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, + conv_info, act_info, desc)); return Status{}; } @@ -124,9 +133,10 @@ void ClIndirectConv2d::run(ITensorPack &tensors) void ClIndirectConv2d::prepare(ITensorPack &constants) { - if(!_is_prepared) + if (!_is_prepared) { - ICLTensor *indirect_buffer_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); + ICLTensor *indirect_buffer_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer"); @@ -134,7 +144,7 @@ void ClIndirectConv2d::prepare(ITensorPack &constants) CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux); ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr); - ITensorPack indirect_buffer_pack{ { ACL_DST, indirect_buffer.get() } }; + ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}}; CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true); _is_prepared = true; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h index e50fa25069..29e796efd9 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.h +++ b/src/gpu/cl/operators/ClIndirectConv2d.h @@ -77,7 +77,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -85,12 +90,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -100,11 +109,11 @@ private: Count }; - std::unique_ptr _indirect_conv_kernel{ nullptr }; - std::unique_ptr _addr_precalculation_kernel{ nullptr }; + std::unique_ptr _indirect_conv_kernel{nullptr}; + std::unique_ptr _addr_precalculation_kernel{nullptr}; TensorInfo _indirect_buffer{}; - bool _is_prepared{ false }; - experimental::MemoryRequirements _aux_mem{ Count }; + bool _is_prepared{false}; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp index b2eb89b320..d8d4186d00 100644 --- a/src/gpu/cl/operators/ClLogicalNot.cpp +++ b/src/gpu/cl/operators/ClLogicalNot.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClLogicalNot.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 49d14127ca..c14b1f2992 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -47,11 +47,17 @@ ClMatMul::ClMatMul() { } -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status ClMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const GPUTarget gpu_target = CLScheduler::get().target(); @@ -61,11 +67,16 @@ Status ClMatMul::validate(const ITensorInfo *lhs, 
const ITensorInfo *rhs, const const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) : - ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) + : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +void ClMatMul::configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); @@ -81,12 +92,13 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel->set_target(gpu_target); // Configure the low-precision native matrix multiply kernel - _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, + act_info); } else { @@ -99,7 +111,7 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l void ClMatMul::run(ITensorPack &tensors) { - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true); } diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index abbb75239a..64dcf217bd 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -26,6 +26,7 @@ #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/function_info/MatMulInfo.h" + #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" @@ -73,7 +74,11 @@ public: * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -81,15 +86,19 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr _matmul_native_kernel{ nullptr }; - std::unique_ptr _matmul_lowp_native_kernel{ nullptr }; + std::unique_ptr _matmul_native_kernel{nullptr}; + std::unique_ptr _matmul_lowp_native_kernel{nullptr}; - bool _is_quantized{ false }; + bool _is_quantized{false}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp index 2066f0cfaa..10cf8a6a38 100644 --- a/src/gpu/cl/operators/ClMul.cpp +++ b/src/gpu/cl/operators/ClMul.cpp @@ -24,17 +24,23 @@ #include "src/gpu/cl/operators/ClMul.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClMulKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClMulKernel.h" namespace arm_compute { namespace opencl { -void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); auto k = std::make_unique(); @@ -42,22 +48,34 @@ void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); } -void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { auto k = std::make_unique(); k->configure(compile_context, src1, src2, dst, act_info); _kernel = 
std::move(k); } -Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h index 6086bc9d52..1cf4d68d4c 100644 --- a/src/gpu/cl/operators/ClMul.h +++ b/src/gpu/cl/operators/ClMul.h @@ -66,16 +66,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ @@ -92,14 +103,21 @@ public: * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp index cf4ebe6083..f3efd00bba 100644 --- a/src/gpu/cl/operators/ClPRelu.cpp +++ b/src/gpu/cl/operators/ClPRelu.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPRelu.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { using KernelType = kernels::ClArithmeticKernel; -void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output) +void ClPRelu::configure(const CLCompileContext &compile_context, + ITensorInfo *input, + ITensorInfo *alpha, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input, alpha, output); auto k = std::make_unique(); @@ -49,7 +51,7 @@ void ClPRelu::run(ITensorPack &tensors) { // Output tensor can be given as nullptr for in-place computation. // In this case, get the input tensor and use it as the output tensor. - if(tensors.get_tensor(TensorType::ACL_DST) == nullptr) + if (tensors.get_tensor(TensorType::ACL_DST) == nullptr) { auto src_tensor = const_cast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); @@ -58,4 +60,4 @@ void ClPRelu::run(ITensorPack &tensors) IClOperator::run(tensors); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h index 8084ab86cd..45ce858fb0 100644 --- a/src/gpu/cl/operators/ClPRelu.h +++ b/src/gpu/cl/operators/ClPRelu.h @@ -47,7 +47,8 @@ public: * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input. * @param[out] output Destination tensor. 
Data type supported: same as @p input */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + void + configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPRelu::configure() diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp index ed56f97bfe..3851e22b6a 100644 --- a/src/gpu/cl/operators/ClPermute.cpp +++ b/src/gpu/cl/operators/ClPermute.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPermute.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPermuteKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermute::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_LOG_PARAMS(src, dst, perm); auto k = std::make_unique(); @@ -45,4 +47,4 @@ Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const return kernels::ClPermuteKernel::validate(src, dst, perm); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h index 3e87329f9b..6349358a18 100644 --- a/src/gpu/cl/operators/ClPermute.h +++ b/src/gpu/cl/operators/ClPermute.h @@ -44,7 +44,10 @@ public: * @param[in] dst The dst tensor info. Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPermute::configure() @@ -55,4 +58,4 @@ public: }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_PERMUTE_H */ diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp index 3da90b8ced..e4507dc1a1 100644 --- a/src/gpu/cl/operators/ClPool2d.cpp +++ b/src/gpu/cl/operators/ClPool2d.cpp @@ -25,16 +25,19 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool2dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) +void ClPool2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices); @@ -49,7 +52,10 @@ void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *s CLScheduler::get().tune_kernel_static(*_kernel); } -Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo 
*indices) +Status ClPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices) { return kernels::ClPool2dKernel::validate(src, dst, info, indices); } diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h index f353ba262e..9c2fd1c3f2 100644 --- a/src/gpu/cl/operators/ClPool2d.h +++ b/src/gpu/cl/operators/ClPool2d.h @@ -50,14 +50,21 @@ public: * @param[in] info Pooling layer parameters. * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp index 7dec6c5958..d230413659 100644 --- a/src/gpu/cl/operators/ClPool3d.cpp +++ b/src/gpu/cl/operators/ClPool3d.cpp @@ -25,16 +25,18 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool3dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool3d::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info) +void ClPool3d::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h index 7d994fd194..9fd78bfd69 100644 --- a/src/gpu/cl/operators/ClPool3d.h +++ b/src/gpu/cl/operators/ClPool3d.h @@ -51,7 +51,10 @@ public: * @param[out] dst Destination tensor info. * @param[in] info 3d Pooling layer parameters. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3d::configure() diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp index 47ae5cea47..8560b5553e 100644 --- a/src/gpu/cl/operators/ClQuantize.cpp +++ b/src/gpu/cl/operators/ClQuantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClQuantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp index 560966f4fc..1dd5b760cb 100644 --- a/src/gpu/cl/operators/ClReshape.cpp +++ b/src/gpu/cl/operators/ClReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClReshape.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClReshapeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp index 0798b19ca0..184e2aa006 100644 --- a/src/gpu/cl/operators/ClScale.cpp +++ b/src/gpu/cl/operators/ClScale.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScale::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -61,4 +64,4 @@ void ClScale::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_kernel.get(), tensors); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h index af97cf23e7..1427bb4fdc 100644 --- a/src/gpu/cl/operators/ClScale.h +++ b/src/gpu/cl/operators/ClScale.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
* @param[in] info @ref ScaleKernelInfo descriptor to be used to configure */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClScale::configure() diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp index 03809553a3..2bec400597 100644 --- a/src/gpu/cl/operators/ClSoftmax.cpp +++ b/src/gpu/cl/operators/ClSoftmax.cpp @@ -22,7 +22,10 @@ * SOFTWARE. */ #include "src/gpu/cl/operators/ClSoftmax.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" @@ -30,8 +33,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "support/Cast.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::experimental; namespace arm_compute @@ -52,7 +53,10 @@ ClSoftmax::ClSoftmax() { } -void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClSoftmax::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -64,14 +68,15 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src; ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst; - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info); } - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type(); - _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? 
DataType::S32 : tmp_input_info.data_type(); + _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); TensorShape max_sum_shape = tmp_input_info.tensor_shape(); _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape); @@ -83,33 +88,41 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info); _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info); - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info); } - _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size()); + _aux_mem[InternalTensorIdx::SUM] = + MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); + + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _permuted_src_info.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _permuted_dst_info.total_size()); } Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(info.beta); - ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast(-src.num_dimensions()) || static_cast(src.num_dimensions()) <= info.axis); + ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast(-src.num_dimensions()) || + static_cast(src.num_dimensions()) <= info.axis); - const size_t actual_axis = static_cast(wrap_around(info.axis, static_cast(src.num_dimensions()))); + const size_t actual_axis = static_cast(wrap_around(info.axis, static_cast(src.num_dimensions()))); const bool needs_permute = actual_axis != 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); - TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + 
misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); + TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector)); @@ -122,9 +135,14 @@ Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const TensorShape max_sum_shape = src.tensor_shape(); max_sum_shape.set(0, 1); TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); + TensorInfo tensor_info_sum(src.clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(QuantizationInfo()) + .set_is_resizable(true)); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info)); return Status{}; @@ -139,10 +157,12 @@ void ClSoftmax::run(ITensorPack &tensors) CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false); CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false); - CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false); - CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false); + CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, + false); + CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, + false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, src); @@ -152,7 +172,7 @@ void ClSoftmax::run(ITensorPack &tensors) ITensorPack sum_pack; ITensorPack norm_pack; - if(_needs_permute) + if (_needs_permute) { sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get()); norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get()); @@ -172,7 +192,7 @@ void ClSoftmax::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false); CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get()); @@ -186,4 +206,4 @@ experimental::MemoryRequirements ClSoftmax::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h index 6c9af585d6..6c2aaaea80 100644 --- a/src/gpu/cl/operators/ClSoftmax.h +++ b/src/gpu/cl/operators/ClSoftmax.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SOFTMAX_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -52,7 +53,10 @@ public: * @param[out] dst 
Destination tensor info. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClSoftmax::configure() @@ -61,7 +65,7 @@ public: */ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -79,7 +83,7 @@ private: std::unique_ptr _permute_output; std::unique_ptr _max_shift_exp_sum_kernel; std::unique_ptr _norm_kernel; - bool _needs_permute{ false }; + bool _needs_permute{false}; TensorInfo _max_info; TensorInfo _sum_info; @@ -90,6 +94,6 @@ private: experimental::MemoryRequirements _aux_mem{}; }; -} // opencl -} // arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ \ No newline at end of file +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp index 53be04a70f..5c6d0c3184 100644 --- a/src/gpu/cl/operators/ClSub.cpp +++ b/src/gpu/cl/operators/ClSub.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClSub.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClSub::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique(); @@ -41,8 +44,11 @@ void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClSub::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h index 7eac437143..6a97275b86 100644 --- a/src/gpu/cl/operators/ClSub.h +++ b/src/gpu/cl/operators/ClSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp index 26feffe2b9..28da0d640a 100644 --- a/src/gpu/cl/operators/ClTranspose.cpp +++ b/src/gpu/cl/operators/ClTranspose.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClTranspose.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClTransposeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClTransposeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp index 90dbe7f291..cec438faeb 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.cpp +++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h" @@ -32,8 +33,12 @@ namespace arm_compute { namespace opencl { -void ClTransposedConvolution::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolution::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info); @@ -43,10 +48,14 @@ void ClTransposedConvolution::configure(const CLCompileContext &compile_context, _transposed_conv_kernel = std::move(kernel_object); } -Status ClTransposedConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info) +Status ClTransposedConvolution::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); return Status{}; } diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h index 
58ebc689ed..660c4f85c1 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.h +++ b/src/gpu/cl/operators/ClTransposedConvolution.h @@ -68,23 +68,30 @@ public: * @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr _transposed_conv_kernel{ nullptr }; + std::unique_ptr _transposed_conv_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */ diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp index b4163a5986..8ec96b247e 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.cpp +++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp @@ -24,20 +24,19 @@ #include "src/gpu/cl/operators/ClWinogradConv2d.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" using namespace arm_compute::experimental; @@ -55,15 +54,16 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); // Check if the input spatial dimensions are smaller than 4 - const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); + const bool is_input_lt4_nchw = + (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); - if(kernel_max_dim == 3U) + if (kernel_max_dim == 3U) { - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); } - else if(kernel_dims == Size2D(3U, 1U)) + else if (kernel_dims == Size2D(3U, 1U)) { output_tile = is_input_lt4_nchw ? 
Size2D(2U, 1U) : Size2D(4U, 1U); } @@ -72,15 +72,13 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); } } - else if(kernel_max_dim == 5U) + else if (kernel_max_dim == 5U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, - kernel_dims.height == 1 ? 1U : 4U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U); } - else if(kernel_max_dim == 7U) + else if (kernel_max_dim == 7U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, - kernel_dims.height == 1 ? 1U : 2U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 1U : 2U); } return output_tile; @@ -91,11 +89,9 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz // Check if we want to configure a Winograd configuration which requires fast math using WinogradConfiguration = std::pair, std::pair>; - std::vector fast_math_winograd = - { + std::vector fast_math_winograd = { WinogradConfiguration(std::pair(4, 4), std::pair(5, 5)), - WinogradConfiguration(std::pair(2, 2), std::pair(7, 7)) - }; + WinogradConfiguration(std::pair(2, 2), std::pair(7, 7))}; auto p = std::make_pair(std::pair(output_tile.width, output_tile.height), std::pair(kernel_size.width, kernel_size.height)); @@ -103,8 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { // Get indeces for the width and height const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); @@ -115,41 +116,49 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), + "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), + "Winograd only supports padding up to half kernel size"); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. + ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); + const TensorShape input0_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info)); // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); + const TensorShape input1_shape = + misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); + const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); // Validate batched matrix multiply TensorShape batched_mm_output_shape = input0.tensor_shape(); batched_mm_output_shape[0] = input1.tensor_shape()[0]; const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); // Configure output transform - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); return Status{}; } @@ -171,8 +180,14 @@ ClWinogradConv2d::ClWinogradConv2d() ClWinogradConv2d::~ClWinogradConv2d() = default; -void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void ClWinogradConv2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, 
biases, dst, conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math); @@ -187,50 +202,53 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, + DataType::F32); //disable winograd for fp16 if fast math is false. + ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); _is_prepared = false; // Configure input transform _input_transform->configure(compile_context, src, &_input0, winograd_info); - _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, + PixelValue()); // Configure filter transform _filter_transform->configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply - _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, - false, false, - GEMMLowpOutputStageInfo(), - (src->data_type() == DataType::F16))); + _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))); // Configure output transform _output_transform->set_target(CLScheduler::get().target()); _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info); - _aux_mem = _batched_mm.workspace(); - const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r) - { - return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); - }) ? - MemoryLifetime::Prepare : - MemoryLifetime::Persistent; + _aux_mem = _batched_mm.workspace(); + const MemoryLifetime wino_wei_lifetm = + std::any_of(std::begin(_aux_mem), std::end(_aux_mem), + [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); }) + ? 
MemoryLifetime::Prepare + : MemoryLifetime::Persistent; _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size())); } -Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status ClWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); return Status{}; @@ -251,10 +269,9 @@ void ClWinogradConv2d::run(ITensorPack &tensors) prepare(tensors); // Run input transform - ITensorPack pack_it - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, input0.get() }, + ITensorPack pack_it{ + {TensorType::ACL_SRC, src}, + {TensorType::ACL_DST, input0.get()}, }; CLScheduler::get().enqueue_op(_border_handler, pack_it, false); CLScheduler::get().enqueue_op(*_input_transform, pack_it, false); @@ -263,31 +280,31 @@ void ClWinogradConv2d::run(ITensorPack &tensors) ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get()); pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get()); - is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); + is_gemm_reshaped ? 
pack_mm.remove_tensor(TensorType::ACL_SRC_1) + : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); _batched_mm.run(pack_mm); // Run output transform - ITensorPack pack_ot - { - { TensorType::ACL_SRC_0, batched_mm_output.get() }, - { TensorType::ACL_SRC_1, biases }, - { TensorType::ACL_DST, dst }, + ITensorPack pack_ot{ + {TensorType::ACL_SRC_0, batched_mm_output.get()}, + {TensorType::ACL_SRC_1, biases}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_output_transform, pack_ot); } void ClWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto weights = + utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); ICLTensor *in1_aux = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(3))); CLAuxTensorHandler input1(_input1, *in1_aux); - ITensorPack pack_ft - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, input1.get() }, + ITensorPack pack_ft{ + {TensorType::ACL_SRC, weights}, + {TensorType::ACL_DST, input1.get()}, }; // Run filter transform and mark original weights as unused CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false); @@ -308,4 +325,4 @@ experimental::MemoryRequirements ClWinogradConv2d::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h index eb2f7a72b2..54ec1a1737 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.h +++ b/src/gpu/cl/operators/ClWinogradConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRADCONV2D_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -41,7 +42,7 @@ namespace kernels class ClWinogradInputTransformKernel; class ClWinogradFilterTransformKernel; class ClWinogradOutputTransformKernel; -} // kernels +} // namespace kernels /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: * * -# @ref kernels::ClWinogradInputTransformKernel @@ -93,20 +94,31 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. 
Default is false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited method overridden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h index af383489a1..81dc3baef4 100644 --- a/src/gpu/cl/utils/ClAuxTensorHandler.h +++ b/src/gpu/cl/utils/ClAuxTensorHandler.h @@ -39,25 +39,26 @@ namespace opencl class CLAuxTensorHandler { public: - CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) + CLAuxTensorHandler( + int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) : _tensor() { - if(info.total_size() == 0) + if (info.total_size() == 0) { return; } _tensor.allocator()->soft_init(info); ICLTensor *packed_tensor = utils::cast::polymorphic_downcast(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) { - if(!bypass_alloc) + if (!bypass_alloc) { _tensor.allocator()->allocate(); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); } - if(pack_inject) + if (pack_inject) { pack.add_tensor(slot_id, &_tensor); _injected_tensor_pack = &pack; @@ -70,22 +71,21 @@ public: } } - CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) - : _tensor() + CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) : _tensor() { _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) + if (info.total_size() <= tensor.info()->total_size()) { _tensor.allocator()->import_memory(tensor.cl_buffer()); } } - CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; + CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete; ~CLAuxTensorHandler() { - if(_injected_tensor_pack) + if (_injected_tensor_pack) { _injected_tensor_pack->remove_tensor(_injected_slot_id); } @@ -103,9 +103,9 @@ public: private: CLTensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ 
TensorType::ACL_UNKNOWN }; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */ diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp index 073ffd413d..f0fac25577 100644 --- a/src/graph/DataLayerVisitor.cpp +++ b/src/graph/DataLayerVisitor.cpp @@ -25,8 +25,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/TypePrinter.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/TypePrinter.h" namespace arm_compute { @@ -43,17 +43,14 @@ void add_convolution_layer_data(DataLayerVisitor::LayerData &layer_data, T &node layer_data["data_layout"] = to_string(layout); // Add padding info std::ostringstream padding; - padding << "[" << to_string(ps_info.pad_left()) << "," - << to_string(ps_info.pad_top()) << "," - << to_string(ps_info.pad_bottom()) << "," - << to_string(ps_info.pad_right()) << "]"; + padding << "[" << to_string(ps_info.pad_left()) << "," << to_string(ps_info.pad_top()) << "," + << to_string(ps_info.pad_bottom()) << "," << to_string(ps_info.pad_right()) << "]"; layer_data["pad"] = padding.str(); // Add stride info std::ostringstream stride; - stride << "[" << to_string(ps_info.stride().first) << "," - << to_string(ps_info.stride().second) << "]"; + stride << "[" << to_string(ps_info.stride().first) << "," << to_string(ps_info.stride().second) << "]"; layer_data["stride"] = stride.str(); @@ -68,12 +65,12 @@ void add_convolution_layer_data(DataLayerVisitor::LayerData &layer_data, T &node // Change input names for weights / bias (if applicable) // Assumes input(1) is weights and input(2) is bias - if(layer_data.count("input_shape1")) + if (layer_data.count("input_shape1")) { layer_data["weights_shape"] = layer_data["input_shape1"]; layer_data.erase("input_shape1"); } - if(layer_data.count("input_shape2")) + if (layer_data.count("input_shape2")) { layer_data["bias_shape"] = layer_data["input_shape2"]; layer_data.erase("input_shape2"); @@ -92,16 +89,17 @@ template void add_generic_layer_data(DataLayerVisitor::LayerData &layer_data, T &node) { // Loop over each input tensor - for(size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no) + for (size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no) { // Add input tensor shapes - if(node.input(tensor_no) != nullptr) + if (node.input(tensor_no) != nullptr) { - layer_data["input_shape" + to_string(tensor_no)] = "[" + to_string(node.input(tensor_no)->desc().shape) + "]"; + layer_data["input_shape" + to_string(tensor_no)] = + "[" + to_string(node.input(tensor_no)->desc().shape) + "]"; } } // Add output tensor shape - if(node.output(0) != nullptr) + if (node.output(0) != nullptr) { layer_data["output_shape0"] = "[" + to_string(node.output(0)->desc().shape) + "]"; } diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp index 4ce53589d4..3ae83f2e80 100644 --- a/src/graph/Graph.cpp +++ b/src/graph/Graph.cpp @@ -34,24 +34,24 @@ Graph::Graph(GraphID id, std::string name) bool Graph::remove_node(NodeID nid) { - if(nid >= _nodes.size()) + if (nid >= _nodes.size()) { return false; } std::unique_ptr &node = _nodes[nid]; - if(node) + if (node) { // Remove input connections - for(auto &input_eid : node->_input_edges) + for (auto &input_eid : node->_input_edges) { remove_connection(input_eid); } // Remove 
output connections std::set output_edges_copy = node->output_edges(); - for(auto &output_eid : output_edges_copy) + for (auto &output_eid : output_edges_copy) { remove_connection(output_eid); } @@ -71,8 +71,10 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size arm_compute::lock_guard lock(_mtx); // Check if node index is valid, if node exists and finally if the connection index is valid - ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || (source_idx >= _nodes[source]->num_outputs())); - ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || (sink_idx >= _nodes[sink]->num_inputs())); + ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || + (source_idx >= _nodes[source]->num_outputs())); + ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || + (sink_idx >= _nodes[sink]->num_inputs())); // Get nodes std::unique_ptr &source_node = _nodes[source]; @@ -80,23 +82,25 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size // Check for duplicate connections (Check only sink node) Edge *sink_node_edge = sink_node->input_edge(sink_idx); - if((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && (sink_node_edge->producer_idx() == source_idx) - && (sink_node_edge->consumer_id() == sink) && (sink_node_edge->consumer_idx() == sink_idx)) + if ((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && + (sink_node_edge->producer_idx() == source_idx) && (sink_node_edge->consumer_id() == sink) && + (sink_node_edge->consumer_idx() == sink_idx)) { return sink_node_edge->id(); } // Check if there is already a tensor associated with output if not create one TensorID tid = source_node->output_id(source_idx); - if(tid == NullTensorID) + if (tid == NullTensorID) { tid = create_tensor(); } std::unique_ptr &tensor = _tensors[tid]; // Create connections - EdgeID eid = _edges.size(); - auto connection = std::make_unique(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get()); + EdgeID eid = _edges.size(); + auto connection = + std::make_unique(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get()); _edges.push_back(std::move(connection)); // Add connections to source and sink nodes @@ -117,7 +121,7 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size bool Graph::remove_connection(EdgeID eid) { - if(eid >= _edges.size()) + if (eid >= _edges.size()) { return false; } @@ -125,22 +129,22 @@ bool Graph::remove_connection(EdgeID eid) std::unique_ptr &edge = _edges[eid]; // Remove node connections - if(edge != nullptr) + if (edge != nullptr) { // Get tensor bound to the edge - if(edge->tensor() != nullptr) + if (edge->tensor() != nullptr) { edge->tensor()->unbind_edge(eid); } // Remove edges from source node - if(edge->producer() != nullptr) + if (edge->producer() != nullptr) { edge->producer()->_output_edges.erase(eid); } // Remove edges from sink node - if((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size())) + if ((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size())) { edge->consumer()->_input_edges[edge->consumer_idx()] = EmptyEdgeID; } @@ -231,4 +235,4 @@ Tensor *Graph::tensor(TensorID id) return (id >= _tensors.size()) ? 
nullptr : _tensors[id].get(); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp index 7e5d3133d1..eab91b2347 100644 --- a/src/graph/GraphBuilder.cpp +++ b/src/graph/GraphBuilder.cpp @@ -24,10 +24,10 @@ #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/core/utils/DataTypeUtils.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" +#include "arm_compute/graph/Graph.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/ToolchainSupport.h" @@ -41,7 +41,8 @@ inline void check_nodeidx_pair(const NodeIdxPair &pair, const Graph &g) { ARM_COMPUTE_UNUSED(pair); ARM_COMPUTE_UNUSED(g); - ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) || (pair.index >= g.node(pair.node_id)->num_outputs())); + ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) || + (pair.index >= g.node(pair.node_id)->num_outputs())); } Status set_node_params(Graph &g, NodeID nid, NodeParams ¶ms) @@ -67,7 +68,8 @@ Status set_accessor_on_node(Graph &g, NodeID nid, bool is_output, size_t idx, IT return Status{}; } -NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) +NodeID add_const_node_with_name( + Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) { params.name = params.name.empty() ? "" : params.name + name; auto nid = GraphBuilder::add_const_node(g, params, desc, std::move(accessor)); @@ -76,7 +78,7 @@ NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string & } template -NodeID create_simple_single_input_output_node(Graph &g, NodeParams ¶ms, NodeIdxPair input, Args &&... args) +NodeID create_simple_single_input_output_node(Graph &g, NodeParams ¶ms, NodeIdxPair input, Args &&...args) { check_nodeidx_pair(input, g); @@ -88,14 +90,17 @@ NodeID create_simple_single_input_output_node(Graph &g, NodeParams ¶ms, Node } template -NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams ¶ms, const std::vector &inputs, Args &&... 
args) +NodeID create_simple_multiple_input_single_output_node(Graph &g, + NodeParams ¶ms, + const std::vector &inputs, + Args &&...args) { ARM_COMPUTE_ERROR_ON(inputs.size() == 0); NodeID nid = g.add_node(std::forward(args)...); unsigned int i = 0; - for(const auto &input : inputs) + for (const auto &input : inputs) { check_nodeidx_pair(input, g); g.add_connection(input.node_id, input.index, nid, i++); @@ -106,7 +111,8 @@ NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams &par } } // namespace -NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) +NodeID +GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) { auto nid = g.add_node(desc); set_node_params(g, nid, params); @@ -114,7 +120,8 @@ NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDes return nid; } -NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) +NodeID +GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor) { auto nid = g.add_node(desc); set_node_params(g, nid, params); @@ -134,21 +141,35 @@ NodeID GraphBuilder::add_output_node(Graph &g, NodeParams params, NodeIdxPair in return nid; } -NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info, +NodeID GraphBuilder::add_activation_node(Graph &g, + NodeParams params, + NodeIdxPair input, + ActivationLayerInfo act_info, const QuantizationInfo &out_quant_info) { return create_simple_single_input_output_node(g, params, input, act_info, out_quant_info); } -NodeID GraphBuilder::add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis, - DataType out_data_type, const QuantizationInfo &out_quant_info) +NodeID GraphBuilder::add_arg_min_max_node(Graph &g, + NodeParams params, + NodeIdxPair input, + ReductionOperation op, + unsigned int axis, + DataType out_data_type, + const QuantizationInfo &out_quant_info) { - return create_simple_single_input_output_node(g, params, input, op, axis, out_data_type, out_quant_info); + return create_simple_single_input_output_node(g, params, input, op, axis, out_data_type, + out_quant_info); } -NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon, - ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor, - ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor) +NodeID GraphBuilder::add_batch_normalization_node(Graph &g, + NodeParams params, + NodeIdxPair input, + float epsilon, + ITensorAccessorUPtr mean_accessor, + ITensorAccessorUPtr var_accessor, + ITensorAccessorUPtr beta_accessor, + ITensorAccessorUPtr gamma_accessor) { check_nodeidx_pair(input, g); @@ -168,14 +189,14 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N // Create beta node NodeID beta_nid = EmptyNodeID; - if(has_beta) + if (has_beta) { beta_nid = add_const_node_with_name(g, params, "Beta", common_desc, std::move(beta_accessor)); } // Create gamma node NodeID gamma_nid = EmptyNodeID; - if(has_gamma) + if (has_gamma) { gamma_nid = add_const_node_with_name(g, params, "Gamma", common_desc, std::move(gamma_accessor)); } @@ -185,11 +206,11 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N 
g.add_connection(input.node_id, input.index, batch_norm_nid, 0); g.add_connection(mean_nid, 0, batch_norm_nid, 1); g.add_connection(var_nid, 0, batch_norm_nid, 2); - if(has_beta) + if (has_beta) { g.add_connection(beta_nid, 0, batch_norm_nid, 3); } - if(has_gamma) + if (has_gamma) { g.add_connection(gamma_nid, 0, batch_norm_nid, 4); } @@ -198,7 +219,8 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N return batch_norm_nid; } -NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info) +NodeID GraphBuilder::add_bounding_box_transform_node( + Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info) { check_nodeidx_pair(input, g); check_nodeidx_pair(deltas, g); @@ -217,10 +239,17 @@ NodeID GraphBuilder::add_channel_shuffle_node(Graph &g, NodeParams params, NodeI return create_simple_single_input_output_node(g, params, input, num_groups); } -NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info, - unsigned int num_groups, ConvolutionMethod method, FastMathHint fast_math_hint, - ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, +NodeID GraphBuilder::add_convolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + unsigned int depth, + PadStrideInfo conv_info, + unsigned int num_groups, + ConvolutionMethod method, + FastMathHint fast_math_hint, + ITensorAccessorUPtr weights_accessor, + ITensorAccessorUPtr bias_accessor, const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info) { @@ -241,7 +270,7 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups); w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth); - if(!weights_quant_info.empty()) + if (!weights_quant_info.empty()) { w_desc.quant_info = weights_quant_info; } @@ -250,11 +279,11 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; b_desc.shape = TensorShape(depth); - if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) + if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -265,7 +294,7 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa NodeID conv_nid = g.add_node(conv_info, num_groups, method, fast_math_hint, out_quant_info); g.add_connection(input.node_id, input.index, conv_nid, 0); g.add_connection(w_nid, 0, conv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, conv_nid, 2); } @@ -274,8 +303,12 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa return conv_nid; } -NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdxPair input, - Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo deconv_info, +NodeID GraphBuilder::add_deconvolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + unsigned int depth, + PadStrideInfo deconv_info, ITensorAccessorUPtr 
weights_accessor, ITensorAccessorUPtr bias_accessor) { @@ -301,11 +334,11 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; b_desc.shape = TensorShape(depth); - if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) + if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -313,10 +346,10 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx } // Create convolution node and connect - NodeID deconv_nid = g.add_node(descriptors::DeconvolutionLayerDescriptor{ deconv_info }); + NodeID deconv_nid = g.add_node(descriptors::DeconvolutionLayerDescriptor{deconv_info}); g.add_connection(input.node_id, input.index, deconv_nid, 0); g.add_connection(w_nid, 0, deconv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, deconv_nid, 2); } @@ -325,14 +358,26 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx return deconv_nid; } -NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, const std::vector &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor) +NodeID GraphBuilder::add_concatenate_node(Graph &g, + NodeParams params, + const std::vector &inputs, + const descriptors::ConcatLayerDescriptor &concat_descriptor) { - return create_simple_multiple_input_single_output_node(g, params, inputs, inputs.size(), concat_descriptor); + return create_simple_multiple_input_single_output_node(g, params, inputs, inputs.size(), + concat_descriptor); } -NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend, - PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method, - ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo &quant_info, const QuantizationInfo &out_quant_info) +NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Size2D kernel_spatial_extend, + PadStrideInfo conv_info, + int depth_multiplier, + DepthwiseConvolutionMethod method, + ITensorAccessorUPtr weights_accessor, + ITensorAccessorUPtr bias_accessor, + const QuantizationInfo &quant_info, + const QuantizationInfo &out_quant_info) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0)); @@ -349,7 +394,7 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height); w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier); - if(!quant_info.empty()) + if (!quant_info.empty()) { w_desc.quant_info = quant_info; } @@ -358,12 +403,13 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; - b_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier); + b_desc.shape = + TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier); - 
if(is_data_type_quantized_asymmetric(b_desc.data_type)) + if (is_data_type_quantized_asymmetric(b_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -375,7 +421,7 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeID conv_nid = g.add_node(conv_info, depth_multiplier, method, out_quant_info); g.add_connection(input.node_id, input.index, conv_nid, 0); g.add_connection(w_nid, 0, conv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, conv_nid, 2); } @@ -394,7 +440,12 @@ NodeID GraphBuilder::add_dequantization_node(Graph &g, NodeParams params, NodeId return create_simple_single_input_output_node(g, params, input); } -NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info) +NodeID GraphBuilder::add_detection_output_node(Graph &g, + NodeParams params, + NodeIdxPair input_loc, + NodeIdxPair input_conf, + NodeIdxPair input_priorbox, + const DetectionOutputLayerInfo &detect_info) { check_nodeidx_pair(input_loc, g); check_nodeidx_pair(input_conf, g); @@ -411,18 +462,24 @@ NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, Node return detect_nid; } -NodeID GraphBuilder::add_detection_post_process_node(Graph &g, NodeParams params, NodeIdxPair input_box_encoding, NodeIdxPair input_class_prediction, const DetectionPostProcessLayerInfo &detect_info, - ITensorAccessorUPtr anchors_accessor, const QuantizationInfo &anchor_quant_info) +NodeID GraphBuilder::add_detection_post_process_node(Graph &g, + NodeParams params, + NodeIdxPair input_box_encoding, + NodeIdxPair input_class_prediction, + const DetectionPostProcessLayerInfo &detect_info, + ITensorAccessorUPtr anchors_accessor, + const QuantizationInfo &anchor_quant_info) { check_nodeidx_pair(input_box_encoding, g); check_nodeidx_pair(input_class_prediction, g); // Get input tensor descriptor - const TensorDescriptor input_box_encoding_tensor_desc = get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]); + const TensorDescriptor input_box_encoding_tensor_desc = + get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]); // Calculate anchor descriptor TensorDescriptor anchor_desc = input_box_encoding_tensor_desc; - if(!anchor_quant_info.empty()) + if (!anchor_quant_info.empty()) { anchor_desc.quant_info = anchor_quant_info; } @@ -446,12 +503,13 @@ NodeID GraphBuilder::add_dummy_node(Graph &g, NodeParams params, NodeIdxPair inp return create_simple_single_input_output_node(g, params, input, shape); } -NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation) +NodeID GraphBuilder::add_elementwise_node( + Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation) { check_nodeidx_pair(input0, g); check_nodeidx_pair(input1, g); - NodeID nid = g.add_node(descriptors::EltwiseLayerDescriptor{ operation }); + NodeID nid = g.add_node(descriptors::EltwiseLayerDescriptor{operation}); g.add_connection(input0.node_id, input0.index, nid, 0); g.add_connection(input1.node_id, input1.index, nid, 1); @@ -466,9 +524,15 @@ NodeID GraphBuilder::add_flatten_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input); } -NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, - NodeID 
weights_nid, NodeID bias_nid, - const FullyConnectedLayerInfo fc_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint) +NodeID GraphBuilder::add_fully_connected_layer(Graph &g, + NodeParams params, + NodeIdxPair input, + unsigned int num_outputs, + NodeID weights_nid, + NodeID bias_nid, + const FullyConnectedLayerInfo fc_info, + const QuantizationInfo &out_quant_info, + FastMathHint fast_math_hint) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON(num_outputs == 0); @@ -483,7 +547,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info, fast_math_hint); g.add_connection(input.node_id, input.index, fc_nid, 0); g.add_connection(weights_nid, 0, fc_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(bias_nid, 0, fc_nid, 2); } @@ -493,10 +557,16 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node return fc_nid; } -NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, - ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, +NodeID GraphBuilder::add_fully_connected_layer(Graph &g, + NodeParams params, + NodeIdxPair input, + unsigned int num_outputs, + ITensorAccessorUPtr weights_accessor, + ITensorAccessorUPtr bias_accessor, const FullyConnectedLayerInfo fc_info, - const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint) + const QuantizationInfo &weights_quant_info, + const QuantizationInfo &out_quant_info, + FastMathHint fast_math_hint) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON(num_outputs == 0); @@ -507,16 +577,17 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); // Create weights node - TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs, fc_info, weights_quant_info); + TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs, + fc_info, weights_quant_info); NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor)); // Create bias nodes NodeID b_nid = EmptyNodeID; - if(has_bias) + if (has_bias) { TensorDescriptor b_desc = input_tensor_desc; b_desc.shape = TensorShape(num_outputs); - if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) + if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type)) { b_desc.data_type = DataType::S32; } @@ -527,7 +598,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info, fast_math_hint); g.add_connection(input.node_id, input.index, fc_nid, 0); g.add_connection(w_nid, 0, fc_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(b_nid, 0, fc_nid, 2); } @@ -537,7 +608,12 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node return fc_nid; } -NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info) +NodeID GraphBuilder::add_generate_proposals_node(Graph &g, + NodeParams params, + NodeIdxPair scores, + NodeIdxPair deltas, + NodeIdxPair anchors, + GenerateProposalsInfo info) { check_nodeidx_pair(scores, g); 
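Editorial aside (not part of the patch): the GraphBuilder entry points re-wrapped in this hunk are normally chained through NodeIdxPair handles ({node_id, index}). Below is a minimal usage sketch, assuming "arm_compute/graph.h" and the default accessor/quantization arguments; build_tiny_graph and the concrete shape are illustrative only and do not come from this commit.

#include "arm_compute/graph.h"

using namespace arm_compute;
using namespace arm_compute::graph;

// Wires input -> activation -> output using the builder functions whose
// signatures appear in this diff (tensor accessors left as nullptr for brevity).
void build_tiny_graph(Graph &g)
{
    NodeParams       params{"example", Target::UNSPECIFIED};
    TensorDescriptor desc(TensorShape(224U, 224U, 3U, 1U), DataType::F32);

    const NodeID in_nid  = GraphBuilder::add_input_node(g, params, desc, nullptr);
    const NodeID act_nid = GraphBuilder::add_activation_node(
        g, params, {in_nid, 0}, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
        QuantizationInfo());
    GraphBuilder::add_output_node(g, params, {act_nid, 0}, nullptr);
}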
check_nodeidx_pair(deltas, g); @@ -558,13 +634,14 @@ NodeID GraphBuilder::add_l2_normalize_node(Graph &g, NodeParams params, NodeIdxP return create_simple_single_input_output_node(g, params, input, axis, epsilon); } -NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info) +NodeID +GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info) { return create_simple_single_input_output_node(g, params, input, norm_info); } -NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input, - ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor) +NodeID GraphBuilder::add_normalize_planar_yuv_node( + Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor) { check_nodeidx_pair(input, g); @@ -589,12 +666,14 @@ NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, return norm_planar_yuv_nid; } -NodeID GraphBuilder::add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value) +NodeID GraphBuilder::add_pad_node( + Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value) { return create_simple_single_input_output_node(g, params, input, paddings, pad_value); } -NodeID GraphBuilder::add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout) +NodeID GraphBuilder::add_permute_node( + Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout) { return create_simple_single_input_output_node(g, params, input, perm, layout); } @@ -618,12 +697,18 @@ NodeID GraphBuilder::add_pooling_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input, pool_info); } -NodeID GraphBuilder::add_print_node(Graph &g, NodeParams params, NodeIdxPair input, std::ostream &stream, const IOFormatInfo &format_info, const std::function transform) +NodeID GraphBuilder::add_print_node(Graph &g, + NodeParams params, + NodeIdxPair input, + std::ostream &stream, + const IOFormatInfo &format_info, + const std::function transform) { return create_simple_single_input_output_node(g, params, input, stream, format_info, transform); } -NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info) +NodeID GraphBuilder::add_priorbox_node( + Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info) { check_nodeidx_pair(input0, g); check_nodeidx_pair(input1, g); @@ -638,12 +723,16 @@ NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair return prior_nid; } -NodeID GraphBuilder::add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info) +NodeID GraphBuilder::add_quantization_node(Graph &g, + NodeParams params, + NodeIdxPair input, + const QuantizationInfo &out_quant_info) { return create_simple_single_input_output_node(g, params, input, out_quant_info); } -NodeID GraphBuilder::add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims) +NodeID GraphBuilder::add_reduction_operation_node( + Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims) { return 
create_simple_single_input_output_node(g, params, input, op, axis, keep_dims); } @@ -658,13 +747,14 @@ NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input, shape); } -NodeID GraphBuilder::add_resize_node(Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, - float width_scale, float height_scale) +NodeID GraphBuilder::add_resize_node( + Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, float width_scale, float height_scale) { return create_simple_single_input_output_node(g, params, input, policy, width_scale, height_scale); } -NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info) +NodeID GraphBuilder::add_roi_align_node( + Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info) { check_nodeidx_pair(input, g); check_nodeidx_pair(rois, g); @@ -678,7 +768,11 @@ NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair return nid; } -NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams ¶ms, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor) +NodeID GraphBuilder::add_scale_layer(Graph &g, + const NodeParams ¶ms, + NodeIdxPair input, + ITensorAccessorUPtr mul_accessor, + ITensorAccessorUPtr add_accessor) { check_nodeidx_pair(input, g); @@ -688,22 +782,23 @@ NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams ¶ms, NodeIdx // Create mul node TensorDescriptor mul_desc = input_tensor_desc; - const size_t C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)]; + const size_t C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)]; mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), 1); mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), 1); mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), C); NodeID mul_const_nid = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor)); - NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 }; + NodeIdxPair mul_const_nidxp = {mul_const_nid, 0}; // Create add node TensorDescriptor add_desc = mul_desc; NodeID add_const_nid = add_const_node_with_name(g, params, "Add", add_desc, std::move(add_accessor)); - NodeIdxPair add_const_nidxp = { add_const_nid, 0 }; + NodeIdxPair add_const_nidxp = {add_const_nid, 0}; // Create node and connect - NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul); - NodeIdxPair mulnode_nidxp = { mul_node, 0 }; - NodeID add_node = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add); + NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul); + NodeIdxPair mulnode_nidxp = {mul_node, 0}; + NodeID add_node = + GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add); return add_node; } @@ -713,17 +808,25 @@ NodeID GraphBuilder::add_softmax_node(Graph &g, NodeParams params, NodeIdxPair i return create_simple_single_input_output_node(g, params, input, beta); } -NodeID GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends) +NodeID 
+GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends) { return create_simple_single_input_output_node(g, params, input, starts, ends); } -NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis) +NodeID +GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis) { return create_simple_single_input_output_node(g, params, input, num_splits, axis); } -NodeID GraphBuilder::add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info) +NodeID GraphBuilder::add_strided_slice_node(Graph &g, + NodeParams params, + NodeIdxPair input, + Coordinates &starts, + Coordinates &ends, + BiStrides &strides, + StridedSliceLayerInfo info) { return create_simple_single_input_output_node(g, params, input, starts, ends, strides, info); } @@ -770,7 +873,8 @@ NodeID GraphBuilder::add_yolo_node(Graph &g, NodeParams params, NodeIdxPair inpu g.add_connection(input.node_id, input.index, cls, 0); g.add_connection(cls, 0, cls_act, 0); - NodeID concat = g.add_node(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL)); + NodeID concat = + g.add_node(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL)); set_node_params(g, concat, params); g.add_connection(act_box, 0, concat, 0); g.add_connection(imm, 0, concat, 1); diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp index 7b74c2fe0e..10850aa259 100644 --- a/src/graph/GraphContext.cpp +++ b/src/graph/GraphContext.cpp @@ -24,15 +24,14 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Utils.h" namespace arm_compute { namespace graph { -GraphContext::GraphContext() - : _config(), _memory_managers(), _weights_managers() +GraphContext::GraphContext() : _config(), _memory_managers(), _weights_managers() { } @@ -56,7 +55,7 @@ void GraphContext::set_config(const GraphConfig &config) bool GraphContext::insert_memory_management_ctx(MemoryManagerContext &&memory_ctx) { Target target = memory_ctx.target; - if(target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers)) + if (target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers)) { return false; } @@ -79,7 +78,7 @@ bool GraphContext::insert_weights_management_ctx(WeightsManagerContext &&weights { Target target = weights_managers.target; - if(_weights_managers.find(target) != std::end(_weights_managers)) + if (_weights_managers.find(target) != std::end(_weights_managers)) { return false; } @@ -102,17 +101,17 @@ std::map &GraphContext::weights_managers() void GraphContext::finalize() { const size_t num_pools = 1; - for(auto &mm_obj : _memory_managers) + for (auto &mm_obj : _memory_managers) { ARM_COMPUTE_ERROR_ON(!mm_obj.second.allocator); // Finalize intra layer memory manager - if(mm_obj.second.intra_mm != nullptr) + if (mm_obj.second.intra_mm != nullptr) { mm_obj.second.intra_mm->populate(*mm_obj.second.allocator, num_pools); } // Finalize cross layer memory manager - if(mm_obj.second.cross_mm != nullptr) + if (mm_obj.second.cross_mm != nullptr) { mm_obj.second.cross_mm->populate(*mm_obj.second.allocator, num_pools); } diff --git a/src/graph/GraphManager.cpp 
b/src/graph/GraphManager.cpp index 45b608c70a..58ae60d4cc 100644 --- a/src/graph/GraphManager.cpp +++ b/src/graph/GraphManager.cpp @@ -23,15 +23,15 @@ */ #include "arm_compute/graph/GraphManager.h" +#include "arm_compute/graph/algorithms/TopologicalSort.h" +#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h" +#include "arm_compute/graph/detail/ExecutionHelpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/Logger.h" #include "arm_compute/graph/PassManager.h" #include "arm_compute/graph/TypePrinter.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/algorithms/TopologicalSort.h" -#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h" -#include "arm_compute/graph/detail/ExecutionHelpers.h" #include "src/common/utils/Log.h" @@ -39,8 +39,7 @@ namespace arm_compute { namespace graph { -GraphManager::GraphManager() - : _workloads() +GraphManager::GraphManager() : _workloads() { } @@ -49,7 +48,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph configuration!"); // Check if graph has been registered - if(_workloads.find(graph.id()) != std::end(_workloads)) + if (_workloads.find(graph.id()) != std::end(_workloads)) { ARM_COMPUTE_ERROR("Graph is already registered!"); } @@ -62,7 +61,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & // In case CLVK is selected, use the CL backend and // update config - if(target == Target::CLVK) + if (target == Target::CLVK) { forced_target = Target::CL; GraphConfig config = ctx.config(); @@ -71,7 +70,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & ctx.set_config(config); } - if(!is_target_supported(target)) + if (!is_target_supported(target)) { forced_target = get_default_target(); ARM_COMPUTE_LOG_GRAPH_INFO("Switching target from " << target << " to " << forced_target << std::endl); @@ -105,7 +104,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & detail::prepare_all_tasks(workload); // Setup tensor memory (Allocate all tensors or setup transition manager) - if(ctx.config().use_transition_memory_manager) + if (ctx.config().use_transition_memory_manager) { detail::configure_transition_manager(graph, ctx, workload); } @@ -130,10 +129,10 @@ void GraphManager::execute_graph(Graph &graph) auto it = _workloads.find(graph.id()); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!"); - while(true) + while (true) { // Call input accessors - if(!detail::call_all_input_node_accessors(it->second)) + if (!detail::call_all_input_node_accessors(it->second)) { return; } @@ -142,7 +141,7 @@ void GraphManager::execute_graph(Graph &graph) detail::call_all_tasks(it->second); // Call output accessors - if(!detail::call_all_output_node_accessors(it->second)) + if (!detail::call_all_output_node_accessors(it->second)) { return; } @@ -157,4 +156,4 @@ void GraphManager::invalidate_graph(Graph &graph) _workloads.erase(it); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp index 70fe44e134..83c3ef7e37 100644 --- a/src/graph/INode.cpp +++ b/src/graph/INode.cpp @@ -75,17 +75,17 @@ void INode::set_assigned_target(Target target) void INode::set_output_tensor(TensorID tid, size_t idx) { - if(tid != NullTensorID && (idx < _outputs.size()) && 
(_graph->tensor(tid) != nullptr)) + if (tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr)) { ARM_COMPUTE_ERROR_ON(_graph == nullptr); Tensor *updated_tensor = _graph->tensor(tid); _outputs[idx] = tid; // Set tensor to all output edges of the node - for(auto &output_edge_id : _output_edges) + for (auto &output_edge_id : _output_edges) { auto output_edge = _graph->edge(output_edge_id); - if(output_edge != nullptr) + if (output_edge != nullptr) { // Unbind edge from current tensor auto current_output_tensor = output_edge->tensor(); diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp index 5369f6f539..90b2e3327f 100644 --- a/src/graph/INodeVisitor.cpp +++ b/src/graph/INodeVisitor.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/graph/INodeVisitor.h" + #include "arm_compute/graph/nodes/Nodes.h" namespace arm_compute diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp index f7e214c1b4..9a889e1da3 100644 --- a/src/graph/PassManager.cpp +++ b/src/graph/PassManager.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace graph { -PassManager::PassManager() - : _passes() +PassManager::PassManager() : _passes() { } @@ -46,7 +45,7 @@ IGraphMutator *PassManager::pass(size_t index) void PassManager::append(std::unique_ptr pass, bool conditional) { - if(pass && conditional) + if (pass && conditional) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl); _passes.push_back(std::move(pass)); @@ -60,9 +59,9 @@ void PassManager::clear() void PassManager::run_all(Graph &g) { - for(auto &pass : _passes) + for (auto &pass : _passes) { - if(pass) + if (pass) { ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl); pass->mutate(g); @@ -72,9 +71,9 @@ void PassManager::run_all(Graph &g) void PassManager::run_type(Graph &g, IGraphMutator::MutationType type) { - for(auto &pass : _passes) + for (auto &pass : _passes) { - if(pass && (pass->type() == type)) + if (pass && (pass->type() == type)) { ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl); pass->mutate(g); @@ -84,17 +83,17 @@ void PassManager::run_type(Graph &g, IGraphMutator::MutationType type) void PassManager::run_index(Graph &g, size_t index) { - if(index >= _passes.size()) + if (index >= _passes.size()) { return; } auto &pass = _passes.at(index); - if(pass != nullptr) + if (pass != nullptr) { ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl); pass->mutate(g); } } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp index 3d4723430f..72679c4ea4 100644 --- a/src/graph/Tensor.cpp +++ b/src/graph/Tensor.cpp @@ -75,20 +75,20 @@ std::unique_ptr Tensor::extract_accessor() bool Tensor::call_accessor() { // Early exit guard - if(!_accessor || !_handle) + if (!_accessor || !_handle) { return false; } const bool access_data = _accessor->access_tensor_data(); - if(access_data) + if (access_data) { // Map tensor _handle->map(true); // Return in case of null backend buffer - if(_handle->tensor().buffer() == nullptr) + if (_handle->tensor().buffer() == nullptr) { return false; } @@ -97,7 +97,7 @@ bool Tensor::call_accessor() // Call accessor bool retval = _accessor->access_tensor(_handle->tensor()); - if(access_data) + if (access_data) { // Unmap tensor _handle->unmap(); diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp index 
3c51289dba..e1248fbb6b 100644 --- a/src/graph/TypeLoader.cpp +++ b/src/graph/TypeLoader.cpp @@ -31,10 +31,9 @@ namespace arm_compute { arm_compute::DataLayout data_layout_from_name(const std::string &name) { - static const std::map data_layouts = - { - { "nhwc", DataLayout::NHWC }, - { "nchw", DataLayout::NCHW }, + static const std::map data_layouts = { + {"nhwc", DataLayout::NHWC}, + {"nchw", DataLayout::NCHW}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -45,7 +44,7 @@ arm_compute::DataLayout data_layout_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } @@ -55,11 +54,10 @@ namespace graph { Target target_from_name(const std::string &name) { - static const std::map targets = - { - { "neon", Target::NEON }, - { "cl", Target::CL }, - { "clvk", Target::CLVK }, + static const std::map targets = { + {"neon", Target::NEON}, + {"cl", Target::CL}, + {"clvk", Target::CLVK}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -70,7 +68,7 @@ Target target_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } @@ -79,12 +77,11 @@ Target target_from_name(const std::string &name) ConvolutionMethod Convolution_method_from_name(const std::string &name) { - static const std::map methods = - { - { "default", ConvolutionMethod::Default }, - { "direct", ConvolutionMethod::Direct }, - { "gemm", ConvolutionMethod::GEMM }, - { "winograd", ConvolutionMethod::Winograd }, + static const std::map methods = { + {"default", ConvolutionMethod::Default}, + {"direct", ConvolutionMethod::Direct}, + {"gemm", ConvolutionMethod::GEMM}, + {"winograd", ConvolutionMethod::Winograd}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -95,7 +92,7 @@ ConvolutionMethod Convolution_method_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } @@ -104,10 +101,9 @@ ConvolutionMethod Convolution_method_from_name(const std::string &name) DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::string &name) { - static const std::map methods = - { - { "default", DepthwiseConvolutionMethod::Default }, - { "optimized3x3", DepthwiseConvolutionMethod::Optimized3x3 }, + static const std::map methods = { + {"default", DepthwiseConvolutionMethod::Default}, + {"optimized3x3", DepthwiseConvolutionMethod::Optimized3x3}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -118,7 +114,7 @@ DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::str #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { throw std::invalid_argument(name); } diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp index dcab177a3b..452d8ec7b2 100644 --- a/src/graph/Utils.cpp +++ b/src/graph/Utils.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/mutators/GraphMutators.h" namespace arm_compute @@ -33,16 +33,17 @@ namespace graph { bool is_target_supported(Target target) { - return backends::BackendRegistry::get().contains(target) && backends::BackendRegistry::get().find_backend(target)->is_backend_supported(); + 
return backends::BackendRegistry::get().contains(target) && + backends::BackendRegistry::get().find_backend(target)->is_backend_supported(); } Target get_default_target() { - if(is_target_supported(Target::NEON)) + if (is_target_supported(Target::NEON)) { return Target::NEON; } - if(is_target_supported(Target::CL)) + if (is_target_supported(Target::CL)) { return Target::CL; } @@ -52,18 +53,18 @@ Target get_default_target() void force_target_to_graph(Graph &g, Target target) { auto &nodes = g.nodes(); - for(auto &node : nodes) + for (auto &node : nodes) { - if(node) + if (node) { node->set_assigned_target(target); } } auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor) + if (tensor) { tensor->desc().target = target; } @@ -76,9 +77,9 @@ PassManager create_default_pass_manager(Target target, const GraphConfig &cfg) PassManager pm; // Passes that mutate graph IR - if(cfg.use_synthetic_type) + if (cfg.use_synthetic_type) { - switch(cfg.synthetic_type) + switch (cfg.synthetic_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -107,9 +108,9 @@ PassManager create_default_pass_manager(Target target, const GraphConfig &cfg) void release_default_graph_context(GraphContext &ctx) { - for(const auto &backend : backends::BackendRegistry::get().backends()) + for (const auto &backend : backends::BackendRegistry::get().backends()) { - if(backend.second->is_backend_supported()) + if (backend.second->is_backend_supported()) { backend.second->release_backend_context(ctx); } @@ -118,9 +119,9 @@ void release_default_graph_context(GraphContext &ctx) void sync_backends() { - for(const auto &backend : backends::BackendRegistry::get().backends()) + for (const auto &backend : backends::BackendRegistry::get().backends()) { - if(backend.second->backend_allocator()) + if (backend.second->backend_allocator()) { backend.second->sync(); } @@ -129,10 +130,10 @@ void sync_backends() void setup_requested_backend_context(GraphContext &ctx, Target target) { - if(backends::BackendRegistry::get().contains(target)) + if (backends::BackendRegistry::get().contains(target)) { const auto &backend = backends::BackendRegistry::get().find_backend(target); - if(backend->is_backend_supported()) + if (backend->is_backend_supported()) { backend->setup_backend_context(ctx); } @@ -141,20 +142,22 @@ void setup_requested_backend_context(GraphContext &ctx, Target target) size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension) { - ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!"); + ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, + "Cannot retrieve the dimension index for an unknown layout!"); return descriptor.shape[get_dimension_idx(descriptor.layout, data_layout_dimension)]; } size_t get_dimension_idx(DataLayout data_layout, const DataLayoutDimension data_layout_dimension) { - ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!"); + ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, + "Cannot retrieve the dimension index for an unknown layout!"); /* Return the index based on the data layout * [N C H W] * [3 2 1 0] * [N H W C] */ - switch(data_layout_dimension) + switch (data_layout_dimension) { case DataLayoutDimension::CHANNEL: return (data_layout == DataLayout::NCHW) ? 
2 : 0; @@ -181,13 +184,13 @@ std::vector get_driving_nodes(const INode &node) const Graph *g = node.graph(); ARM_COMPUTE_ERROR_ON(g == nullptr); - for(auto &output_edge_id : node.output_edges()) + for (auto &output_edge_id : node.output_edges()) { auto output_edge = g->edge(output_edge_id); - if(output_edge != nullptr) + if (output_edge != nullptr) { ARM_COMPUTE_ERROR_ON(output_edge->consumer() == nullptr); - driving_nodes.push_back({ output_edge->consumer_id(), output_edge->consumer_idx() }); + driving_nodes.push_back({output_edge->consumer_id(), output_edge->consumer_idx()}); } } @@ -201,13 +204,13 @@ std::vector get_driver_nodes(const INode &node) const Graph *g = node.graph(); ARM_COMPUTE_ERROR_ON(g == nullptr); - for(auto &input_edge_id : node.input_edges()) + for (auto &input_edge_id : node.input_edges()) { auto input_edge = g->edge(input_edge_id); - if(input_edge != nullptr) + if (input_edge != nullptr) { ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr); - driver_nodes.push_back({ input_edge->producer_id(), input_edge->producer_idx() }); + driver_nodes.push_back({input_edge->producer_id(), input_edge->producer_idx()}); } } @@ -216,7 +219,7 @@ std::vector get_driver_nodes(const INode &node) void configure_tensor(Tensor *tensor) { - if(tensor != nullptr && tensor->handle() == nullptr) + if (tensor != nullptr && tensor->handle() == nullptr) { Target target = tensor->desc().target; backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(target); diff --git a/src/graph/Workload.cpp b/src/graph/Workload.cpp index b9d57295b0..9dddad7cbd 100644 --- a/src/graph/Workload.cpp +++ b/src/graph/Workload.cpp @@ -40,12 +40,12 @@ void ExecutionTask::operator()() void execute_task(ExecutionTask &task) { - if(task.task) + if (task.task) { task.task->run(); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED - else if(task.node->type() == NodeType::PrintLayer) + else if (task.node->type() == NodeType::PrintLayer) { auto print_node = utils::cast::polymorphic_downcast(task.node); auto input_handle = print_node->input(0)->handle(); @@ -61,14 +61,13 @@ void execute_task(ExecutionTask &task) void ExecutionTask::prepare() { - if(task) + if (task) { task->prepare(); } } -TaskExecutor::TaskExecutor() - : execute_function(execute_task) +TaskExecutor::TaskExecutor() : execute_function(execute_task) { } @@ -78,4 +77,4 @@ TaskExecutor &TaskExecutor::get() return executor; } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/algorithms/TopologicalSort.cpp b/src/graph/algorithms/TopologicalSort.cpp index 3a69352471..08e14e1657 100644 --- a/src/graph/algorithms/TopologicalSort.cpp +++ b/src/graph/algorithms/TopologicalSort.cpp @@ -50,14 +50,14 @@ inline bool all_inputs_are_visited(const INode *node, const std::vector &v ARM_COMPUTE_ERROR_ON(graph == nullptr); bool are_all_visited = true; - for(const auto &input_edge_id : node->input_edges()) + for (const auto &input_edge_id : node->input_edges()) { - if(input_edge_id != EmptyNodeID) + if (input_edge_id != EmptyNodeID) { const Edge *input_edge = graph->edge(input_edge_id); ARM_COMPUTE_ERROR_ON(input_edge == nullptr); ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr); - if(!visited[input_edge->producer_id()]) + if (!visited[input_edge->producer_id()]) { are_all_visited = false; break; @@ -80,9 +80,9 @@ std::vector bfs(Graph &g) std::list queue; // Push inputs and mark as visited - for(auto &input : g.nodes(NodeType::Input)) + for (auto &input : g.nodes(NodeType::Input)) 
{ - if(input != EmptyNodeID) + if (input != EmptyNodeID) { visited[input] = true; queue.push_back(input); @@ -90,9 +90,9 @@ std::vector bfs(Graph &g) } // Push const nodes and mark as visited - for(auto &const_node : g.nodes(NodeType::Const)) + for (auto &const_node : g.nodes(NodeType::Const)) { - if(const_node != EmptyNodeID) + if (const_node != EmptyNodeID) { visited[const_node] = true; queue.push_back(const_node); @@ -100,7 +100,7 @@ std::vector bfs(Graph &g) } // Iterate over vector and edges - while(!queue.empty()) + while (!queue.empty()) { // Dequeue a node from queue and process NodeID n = queue.front(); @@ -109,11 +109,11 @@ std::vector bfs(Graph &g) const INode *node = g.node(n); ARM_COMPUTE_ERROR_ON(node == nullptr); - for(const auto &eid : node->output_edges()) + for (const auto &eid : node->output_edges()) { const Edge *e = g.edge(eid); ARM_COMPUTE_ERROR_ON(e == nullptr); - if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) + if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) { visited[e->consumer_id()] = true; queue.push_back(e->consumer_id()); @@ -135,9 +135,9 @@ std::vector dfs(Graph &g) std::stack stack; // Push inputs and mark as visited - for(auto &input : g.nodes(NodeType::Input)) + for (auto &input : g.nodes(NodeType::Input)) { - if(input != EmptyNodeID) + if (input != EmptyNodeID) { visited[input] = true; stack.push(input); @@ -145,9 +145,9 @@ std::vector dfs(Graph &g) } // Push const nodes and mark as visited - for(auto &const_node : g.nodes(NodeType::Const)) + for (auto &const_node : g.nodes(NodeType::Const)) { - if(const_node != EmptyNodeID) + if (const_node != EmptyNodeID) { visited[const_node] = true; stack.push(const_node); @@ -155,7 +155,7 @@ std::vector dfs(Graph &g) } // Iterate over vector and edges - while(!stack.empty()) + while (!stack.empty()) { // Pop a node from stack and process NodeID n = stack.top(); @@ -163,7 +163,7 @@ std::vector dfs(Graph &g) stack.pop(); // Mark node as visited - if(!visited[n]) + if (!visited[n]) { visited[n] = true; } @@ -171,11 +171,11 @@ std::vector dfs(Graph &g) const INode *node = g.node(n); ARM_COMPUTE_ERROR_ON(node == nullptr); // Reverse iterate to push branches from right to left and pop on the opposite order - for(const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges())) + for (const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges())) { const Edge *e = g.edge(eid); ARM_COMPUTE_ERROR_ON(e == nullptr); - if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) + if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited)) { stack.push(e->consumer_id()); } diff --git a/src/graph/backends/BackendRegistry.cpp b/src/graph/backends/BackendRegistry.cpp index 46b4f99e23..bb6af79f8b 100644 --- a/src/graph/backends/BackendRegistry.cpp +++ b/src/graph/backends/BackendRegistry.cpp @@ -31,8 +31,7 @@ namespace graph { namespace backends { -BackendRegistry::BackendRegistry() - : _registered_backends() +BackendRegistry::BackendRegistry() : _registered_backends() { } diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp index 01e5ab1730..e27a4109d1 100644 --- a/src/graph/backends/CL/CLDeviceBackend.cpp +++ b/src/graph/backends/CL/CLDeviceBackend.cpp @@ -23,18 +23,17 @@ */ #include "arm_compute/graph/backends/CL/CLDeviceBackend.h" -#include "arm_compute/graph/Graph.h" -#include 
"arm_compute/graph/GraphContext.h" -#include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/graph/backends/BackendRegistrar.h" #include "arm_compute/graph/backends/CL/CLFunctionFactory.h" #include "arm_compute/graph/backends/CL/CLNodeValidator.h" #include "arm_compute/graph/backends/CL/CLSubTensorHandle.h" #include "arm_compute/graph/backends/CL/CLTensorHandle.h" - -#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/Tensor.h" #include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/CL/CLBufferAllocator.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -64,7 +63,12 @@ bool file_exists(const std::string &filename) static detail::BackendRegistrar CLDeviceBackend_registrar(Target::CL); CLDeviceBackend::CLDeviceBackend() - : _context_count(0), _tuner(), _gemm_heuristics(), _allocator(nullptr), _tuner_file(), _backend_type(CLBackendType::Native) + : _context_count(0), + _tuner(), + _gemm_heuristics(), + _allocator(nullptr), + _tuner_file(), + _backend_type(CLBackendType::Native) { } @@ -95,7 +99,7 @@ void CLDeviceBackend::release_backend_context(GraphContext &ctx) { ARM_COMPUTE_UNUSED(ctx); _context_count--; - if(_context_count == 0) // No more context using the backend: free resources + if (_context_count == 0) // No more context using the backend: free resources { _allocator = nullptr; } @@ -105,7 +109,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) { // Force backend initialization _context_count++; - if(_context_count == 1) + if (_context_count == 1) { _backend_type = ctx.config().backend_type; initialize_backend(); @@ -115,7 +119,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) _tuner_file = ctx.config().tuner_file; // Load tuner data if available - if(file_exists(_tuner_file)) + if (file_exists(_tuner_file)) { _tuner.load_from_file(_tuner_file); } @@ -128,7 +132,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) CLScheduler::get().gemm_heuristics()->reload_from_file(ctx.config().mlgo_file); // Setup a management backend - if(ctx.memory_management_ctx(Target::CL) == nullptr) + if (ctx.memory_management_ctx(Target::CL) == nullptr) { MemoryManagerContext mm_ctx; mm_ctx.target = Target::CL; @@ -141,7 +145,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx) } // Create function level weights manager - if(ctx.weights_management_ctx(Target::CL) == nullptr) + if (ctx.weights_management_ctx(Target::CL) == nullptr) { WeightsManagerContext wm_ctx; wm_ctx.target = Target::CL; @@ -174,9 +178,10 @@ std::unique_ptr CLDeviceBackend::create_tensor(const Tensor &tens return std::make_unique(info); } -std::unique_ptr CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) +std::unique_ptr +CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) { - if(parent == nullptr) + if (parent == nullptr) { return nullptr; } @@ -203,7 +208,7 @@ arm_compute::Status CLDeviceBackend::validate_node(INode &node) std::shared_ptr CLDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity) { - if(affinity == MemoryManagerAffinity::Offset) + if (affinity == MemoryManagerAffinity::Offset) { 
ARM_COMPUTE_LOG_GRAPH_WARNING("CL Backend does not support offset affinity memory management!"); return nullptr; diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp index 882810474e..d4e1aa880f 100644 --- a/src/graph/backends/CL/CLFunctionsFactory.cpp +++ b/src/graph/backends/CL/CLFunctionsFactory.cpp @@ -22,12 +22,12 @@ * SOFTWARE. */ #include "arm_compute/graph/backends/CL/CLFunctionFactory.h" - +#include "arm_compute/graph/backends/FunctionHelpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" -#include "arm_compute/graph/backends/FunctionHelpers.h" #include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" + #include "src/core/CL/CLKernels.h" #include "support/Cast.h" @@ -89,20 +89,19 @@ class CPPWrapperFunction : public IFunction { public: /* Default constructor */ - CPPWrapperFunction() - : _tensors(), _func(nullptr) + CPPWrapperFunction() : _tensors(), _func(nullptr) { } void run() override { - for(auto &tensor : _tensors) + for (auto &tensor : _tensors) { tensor->map(CLScheduler::get().queue()); } _func->run(); - for(auto &tensor : _tensors) + for (auto &tensor : _tensors) { tensor->unmap(CLScheduler::get().queue()); } @@ -127,7 +126,8 @@ namespace detail { // Specialized functions template <> -std::unique_ptr create_detection_output_layer(DetectionOutputLayerNode &node) +std::unique_ptr +create_detection_output_layer(DetectionOutputLayerNode &node) { validate_node(node, 3 /* expected inputs */, 1 /* expected outputs */); @@ -149,16 +149,12 @@ std::unique_ptr create_detection_output_layerinfo()->data_type() - << " Input0 shape: " << input0->info()->tensor_shape() - << " Input1 shape: " << input1->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType + << " Data Type: " << input0->info()->data_type() << " Input0 shape: " + << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape() << " Input2 shape: " << input2->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() - << " DetectionOutputLayer info: " << detect_info - << std::endl); + << " DetectionOutputLayer info: " << detect_info << std::endl); auto wrap_function = std::make_unique(); @@ -171,7 +167,8 @@ std::unique_ptr create_detection_output_layer -std::unique_ptr create_detection_post_process_layer(DetectionPostProcessLayerNode &node) +std::unique_ptr +create_detection_post_process_layer(DetectionPostProcessLayerNode &node) { validate_node(node, 3 /* expected inputs */, 4 /* expected outputs */); @@ -199,19 +196,15 @@ std::unique_ptr create_detection_post_process_layerinfo()->data_type() - << " Input0 shape: " << input0->info()->tensor_shape() - << " Input1 shape: " << input1->info()->tensor_shape() + << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType + << " Data Type: " << input0->info()->data_type() << " Input0 shape: " + << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape() << " Input2 shape: " << input2->info()->tensor_shape() << " Output0 shape: " << output0->info()->tensor_shape() << " Output1 shape: " << output1->info()->tensor_shape() << " Output2 shape: " << output2->info()->tensor_shape() << " Output3 shape: " << output3->info()->tensor_shape() - << " DetectionPostProcessLayer info: " << detect_info - << std::endl); + << " DetectionPostProcessLayer info: " << detect_info << std::endl); auto wrap_function = 
std::make_unique(); @@ -230,92 +223,128 @@ std::unique_ptr create_detection_post_process_layer CLFunctionFactory::create(INode *node, GraphContext &ctx) { - if(node == nullptr) + if (node == nullptr) { return nullptr; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ActivationLayer: - return detail::create_activation_layer(*polymorphic_downcast(node)); + return detail::create_activation_layer( + *polymorphic_downcast(node)); case NodeType::ArgMinMaxLayer: - return detail::create_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::create_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BatchNormalizationLayer: - return detail::create_batch_normalization_layer(*polymorphic_downcast(node)); + return detail::create_batch_normalization_layer( + *polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: - return detail::create_bounding_box_transform_layer(*polymorphic_downcast(node)); + return detail::create_bounding_box_transform_layer( + *polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: - return detail::create_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::create_channel_shuffle_layer( + *polymorphic_downcast(node)); case NodeType::ConvolutionLayer: - return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_convolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::DeconvolutionLayer: - return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_deconvolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::ConcatenateLayer: - return detail::create_concatenate_layer(*polymorphic_downcast(node)); + return detail::create_concatenate_layer( + *polymorphic_downcast(node)); case NodeType::DepthToSpaceLayer: - return detail::create_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::create_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::create_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::create_dequantization_layer(*polymorphic_downcast(node)); + return detail::create_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return detail::create_detection_output_layer(*polymorphic_downcast(node)); + return detail::create_detection_output_layer( + *polymorphic_downcast(node)); case NodeType::DetectionPostProcessLayer: - return detail::create_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::create_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::create_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_eltwise_layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return detail::create_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_unary_eltwise_layer( + *polymorphic_downcast(node)); case NodeType::FlattenLayer: - return detail::create_flatten_layer(*polymorphic_downcast(node)); + return detail::create_flatten_layer( + *polymorphic_downcast(node)); case NodeType::FullyConnectedLayer: - return detail::create_fully_connected_layer(*polymorphic_downcast(node), ctx); + return detail::create_fully_connected_layer( + *polymorphic_downcast(node), ctx); case 
NodeType::FusedConvolutionBatchNormalizationLayer: - return detail::create_fused_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_fused_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer: - return detail::create_fused_depthwise_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_fused_depthwise_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::GenerateProposalsLayer: - return detail::create_generate_proposals_layer(*polymorphic_downcast(node), ctx); + return detail::create_generate_proposals_layer( + *polymorphic_downcast(node), ctx); case NodeType::L2NormalizeLayer: - return detail::create_l2_normalize_layer(*polymorphic_downcast(node), ctx); + return detail::create_l2_normalize_layer( + *polymorphic_downcast(node), ctx); case NodeType::NormalizationLayer: - return detail::create_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::NormalizePlanarYUVLayer: - return detail::create_normalize_planar_yuv_layer(*polymorphic_downcast(node)); + return detail::create_normalize_planar_yuv_layer( + *polymorphic_downcast(node)); case NodeType::PadLayer: return detail::create_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: - return detail::create_permute_layer(*polymorphic_downcast(node)); + return detail::create_permute_layer( + *polymorphic_downcast(node)); case NodeType::PoolingLayer: - return detail::create_pooling_layer(*polymorphic_downcast(node)); + return detail::create_pooling_layer( + *polymorphic_downcast(node)); case NodeType::PReluLayer: - return detail::create_prelu_layer(*polymorphic_downcast(node)); + return detail::create_prelu_layer( + *polymorphic_downcast(node)); case NodeType::PrintLayer: return detail::create_print_layer(*polymorphic_downcast(node)); case NodeType::PriorBoxLayer: - return detail::create_priorbox_layer(*polymorphic_downcast(node)); + return detail::create_priorbox_layer( + *polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::create_quantization_layer(*polymorphic_downcast(node)); + return detail::create_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::create_reduction_operation_layer(*polymorphic_downcast(node), ctx); + return detail::create_reduction_operation_layer( + *polymorphic_downcast(node), ctx); case NodeType::ReorgLayer: - return detail::create_reorg_layer(*polymorphic_downcast(node)); + return detail::create_reorg_layer( + *polymorphic_downcast(node)); case NodeType::ReshapeLayer: - return detail::create_reshape_layer(*polymorphic_downcast(node)); + return detail::create_reshape_layer( + *polymorphic_downcast(node)); case NodeType::ResizeLayer: return detail::create_resize_layer(*polymorphic_downcast(node)); case NodeType::ROIAlignLayer: - return detail::create_roi_align_layer(*polymorphic_downcast(node)); + return detail::create_roi_align_layer( + *polymorphic_downcast(node)); case NodeType::SliceLayer: return detail::create_slice_layer(*polymorphic_downcast(node)); case NodeType::SoftmaxLayer: - return detail::create_softmax_layer(*polymorphic_downcast(node), ctx); + return detail::create_softmax_layer( + *polymorphic_downcast(node), ctx); case NodeType::StackLayer: - return 
detail::create_stack_layer(*polymorphic_downcast(node)); + return detail::create_stack_layer( + *polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::create_strided_slice_layer(*polymorphic_downcast(node)); + return detail::create_strided_slice_layer( + *polymorphic_downcast(node)); default: return nullptr; } diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp index 8fd8c14f63..510eda7935 100644 --- a/src/graph/backends/CL/CLNodeValidator.cpp +++ b/src/graph/backends/CL/CLNodeValidator.cpp @@ -25,7 +25,6 @@ #include "arm_compute/graph/backends/ValidateHelpers.h" #include "arm_compute/graph/nodes/Nodes.h" - #include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" @@ -57,41 +56,51 @@ struct CLUnaryEltwiseLayerFunctions Status CLNodeValidator::validate(INode *node) { - if(node == nullptr) + if (node == nullptr) { return Status{}; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ArgMinMaxLayer: - return detail::validate_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::validate_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: - return detail::validate_bounding_box_transform_layer(*polymorphic_downcast(node)); + return detail::validate_bounding_box_transform_layer( + *polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: - return detail::validate_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::validate_channel_shuffle_layer( + *polymorphic_downcast(node)); case NodeType::ConvolutionLayer: - return detail::validate_convolution_layer(*polymorphic_downcast(node)); + return detail::validate_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DepthToSpaceLayer: - return detail::validate_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::validate_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::validate_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::validate_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::validate_dequantization_layer(*polymorphic_downcast(node)); + return detail::validate_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return detail::validate_detection_output_layer(*polymorphic_downcast(node)); + return detail::validate_detection_output_layer( + *polymorphic_downcast(node)); case NodeType::DetectionPostProcessLayer: - return detail::validate_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::validate_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::GenerateProposalsLayer: - return detail::validate_generate_proposals_layer(*polymorphic_downcast(node)); + return detail::validate_generate_proposals_layer( + *polymorphic_downcast(node)); case NodeType::L2NormalizeLayer: - return detail::validate_l2_normalize_layer(*polymorphic_downcast(node)); + return detail::validate_l2_normalize_layer( + *polymorphic_downcast(node)); case NodeType::NormalizePlanarYUVLayer: - return detail::validate_normalize_planar_yuv_layer(*polymorphic_downcast(node)); + return detail::validate_normalize_planar_yuv_layer( + *polymorphic_downcast(node)); case NodeType::PadLayer: return detail::validate_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: @@ -101,9 +110,11 @@ Status 
CLNodeValidator::validate(INode *node) case NodeType::PriorBoxLayer: return detail::validate_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::validate_quantization_layer(*polymorphic_downcast(node)); + return detail::validate_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::validate_reduction_operation_layer(*polymorphic_downcast(node)); + return detail::validate_reduction_operation_layer( + *polymorphic_downcast(node)); case NodeType::ReorgLayer: return detail::validate_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: @@ -113,11 +124,14 @@ Status CLNodeValidator::validate(INode *node) case NodeType::SliceLayer: return detail::validate_slice_layer(*polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::validate_strided_slice_layer(*polymorphic_downcast(node)); + return detail::validate_strided_slice_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::validate_eltwise_Layer(*polymorphic_downcast(node)); + return detail::validate_eltwise_Layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return detail::validate_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::validate_unary_eltwise_layer( + *polymorphic_downcast(node)); default: return Status{}; } diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp index b97d25890a..ccdc877a18 100644 --- a/src/graph/backends/CL/CLSubTensorHandle.cpp +++ b/src/graph/backends/CL/CLSubTensorHandle.cpp @@ -31,7 +31,10 @@ namespace graph { namespace backends { -CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent) +CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, + const TensorShape &shape, + const Coordinates &coords, + bool extend_parent) : _sub_tensor(), _parent_handle(nullptr) { ARM_COMPUTE_ERROR_ON(!parent_handle); @@ -98,4 +101,4 @@ Target CLSubTensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp index a496c2ce47..1b69f9dede 100644 --- a/src/graph/backends/CL/CLTensorHandle.cpp +++ b/src/graph/backends/CL/CLTensorHandle.cpp @@ -31,8 +31,7 @@ namespace graph { namespace backends { -CLTensorHandle::CLTensorHandle(const ITensorInfo &info) - : _tensor() +CLTensorHandle::CLTensorHandle(const ITensorInfo &info) : _tensor() { _tensor.allocator()->init(info); } @@ -49,7 +48,7 @@ void CLTensorHandle::free() void CLTensorHandle::manage(IMemoryGroup *mg) { - if(mg != nullptr) + if (mg != nullptr) { mg->manage(&_tensor); } @@ -68,7 +67,7 @@ void CLTensorHandle::unmap() void CLTensorHandle::release_if_unused() { // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used - if(!_tensor.is_used()) + if (!_tensor.is_used()) { _tensor.allocator()->free(); } @@ -100,4 +99,4 @@ Target CLTensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp index 18456538da..fc7b309803 100644 --- a/src/graph/backends/NEON/NEDeviceBackend.cpp +++ b/src/graph/backends/NEON/NEDeviceBackend.cpp 
@@ -23,18 +23,17 @@ */ #include "arm_compute/graph/backends/NEON/NEDeviceBackend.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/GraphContext.h" -#include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/graph/backends/BackendRegistrar.h" #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h" #include "arm_compute/graph/backends/NEON/NENodeValidator.h" #include "arm_compute/graph/backends/NEON/NESubTensorHandle.h" #include "arm_compute/graph/backends/NEON/NETensorHandle.h" - -#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/Logger.h" +#include "arm_compute/graph/Tensor.h" #include "arm_compute/runtime/Allocator.h" #include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/IWeightsManager.h" @@ -53,8 +52,7 @@ namespace backends /** Register CPU backend */ static detail::BackendRegistrar NEDeviceBackend_registrar(Target::NEON); -NEDeviceBackend::NEDeviceBackend() - : _allocator() +NEDeviceBackend::NEDeviceBackend() : _allocator() { } @@ -72,13 +70,13 @@ void NEDeviceBackend::release_backend_context(GraphContext &ctx) void NEDeviceBackend::setup_backend_context(GraphContext &ctx) { // Set number of threads - if(ctx.config().num_threads >= 0) + if (ctx.config().num_threads >= 0) { Scheduler::get().set_num_threads(ctx.config().num_threads); } // Create function level memory manager - if(ctx.memory_management_ctx(Target::NEON) == nullptr) + if (ctx.memory_management_ctx(Target::NEON) == nullptr) { MemoryManagerContext mm_ctx; mm_ctx.target = Target::NEON; @@ -91,7 +89,7 @@ void NEDeviceBackend::setup_backend_context(GraphContext &ctx) } // Create function level weights manager - if(ctx.weights_management_ctx(Target::NEON) == nullptr) + if (ctx.weights_management_ctx(Target::NEON) == nullptr) { WeightsManagerContext wm_ctx; wm_ctx.target = Target::NEON; @@ -124,9 +122,10 @@ std::unique_ptr NEDeviceBackend::create_tensor(const Tensor &tens return std::make_unique(info); } -std::unique_ptr NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) +std::unique_ptr +NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) { - if(parent == nullptr) + if (parent == nullptr) { return nullptr; } @@ -154,7 +153,7 @@ arm_compute::Status NEDeviceBackend::validate_node(INode &node) std::shared_ptr NEDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity) { std::shared_ptr lifetime_mgr = nullptr; - if(affinity == MemoryManagerAffinity::Buffer) + if (affinity == MemoryManagerAffinity::Buffer) { lifetime_mgr = std::make_shared(); } diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp index d7ed5f9ecb..fe15d4cec1 100644 --- a/src/graph/backends/NEON/NEFunctionFactory.cpp +++ b/src/graph/backends/NEON/NEFunctionFactory.cpp @@ -23,13 +23,13 @@ */ #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h" +#include "arm_compute/graph/backends/FunctionHelpers.h" +#include "arm_compute/graph/backends/Utils.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/TypePrinter.h" -#include 
"arm_compute/graph/backends/FunctionHelpers.h" -#include "arm_compute/graph/backends/Utils.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/TypePrinter.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" #include "arm_compute/runtime/NEON/NEFunctions.h" @@ -88,7 +88,8 @@ struct NEFusedLayerTypes namespace detail { template <> -std::unique_ptr create_normalization_layer(NormalizationLayerNode &node, GraphContext &ctx) +std::unique_ptr create_normalization_layer(NormalizationLayerNode &node, + GraphContext &ctx) { validate_node(node, 1 /* expected inputs */, 1 /* expected outputs */); @@ -105,14 +106,10 @@ std::unique_ptr create_normalization_layerinfo()->data_type() - << " Input shape: " << input->info()->tensor_shape() - << " Output shape: " << output->info()->tensor_shape() - << " Normalization info: " << norm_info.type() - << std::endl); + << node.name() << " Type: " << node.type() << " Target: " << NETargetInfo::TargetType + << " Data Type: " << input->info()->data_type() << " Input shape: " + << input->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape() + << " Normalization info: " << norm_info.type() << std::endl); return func; } @@ -120,84 +117,116 @@ std::unique_ptr create_normalization_layer NEFunctionFactory::create(INode *node, GraphContext &ctx) { - if(node == nullptr) + if (node == nullptr) { return nullptr; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ActivationLayer: - return detail::create_activation_layer(*polymorphic_downcast(node)); + return detail::create_activation_layer( + *polymorphic_downcast(node)); case NodeType::ArgMinMaxLayer: - return detail::create_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::create_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BatchNormalizationLayer: - return detail::create_batch_normalization_layer(*polymorphic_downcast(node)); + return detail::create_batch_normalization_layer( + *polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: - return detail::create_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::create_channel_shuffle_layer( + *polymorphic_downcast(node)); case NodeType::ConvolutionLayer: - return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_convolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::DepthToSpaceLayer: - return detail::create_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::create_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DeconvolutionLayer: - return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); + return detail::create_deconvolution_layer( + *polymorphic_downcast(node), ctx); case NodeType::ConcatenateLayer: - return detail::create_concatenate_layer(*polymorphic_downcast(node)); + return detail::create_concatenate_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::create_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::create_dequantization_layer(*polymorphic_downcast(node)); + return detail::create_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return detail::create_detection_output_layer(*polymorphic_downcast(node)); + return detail::create_detection_output_layer( + *polymorphic_downcast(node)); 
case NodeType::DetectionPostProcessLayer: - return detail::create_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::create_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::create_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_eltwise_layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return detail::create_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::create_unary_eltwise_layer( + *polymorphic_downcast(node)); case NodeType::FlattenLayer: - return detail::create_flatten_layer(*polymorphic_downcast(node)); + return detail::create_flatten_layer( + *polymorphic_downcast(node)); case NodeType::FullyConnectedLayer: - return detail::create_fully_connected_layer(*polymorphic_downcast(node), ctx); + return detail::create_fully_connected_layer( + *polymorphic_downcast(node), ctx); case NodeType::FusedConvolutionBatchNormalizationLayer: - return detail::create_fused_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_fused_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer: - return detail::create_fused_depthwise_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_fused_depthwise_convolution_batch_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::L2NormalizeLayer: - return detail::create_l2_normalize_layer(*polymorphic_downcast(node), ctx); + return detail::create_l2_normalize_layer( + *polymorphic_downcast(node), ctx); case NodeType::NormalizationLayer: - return detail::create_normalization_layer(*polymorphic_downcast(node), ctx); + return detail::create_normalization_layer( + *polymorphic_downcast(node), ctx); case NodeType::PadLayer: return detail::create_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: - return detail::create_permute_layer(*polymorphic_downcast(node)); + return detail::create_permute_layer( + *polymorphic_downcast(node)); case NodeType::PoolingLayer: - return detail::create_pooling_layer(*polymorphic_downcast(node)); + return detail::create_pooling_layer( + *polymorphic_downcast(node)); case NodeType::PReluLayer: - return detail::create_prelu_layer(*polymorphic_downcast(node)); + return detail::create_prelu_layer( + *polymorphic_downcast(node)); case NodeType::PrintLayer: return detail::create_print_layer(*polymorphic_downcast(node)); case NodeType::PriorBoxLayer: - return detail::create_priorbox_layer(*polymorphic_downcast(node)); + return detail::create_priorbox_layer( + *polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::create_quantization_layer(*polymorphic_downcast(node)); + return detail::create_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::create_reduction_operation_layer(*polymorphic_downcast(node), ctx); + return detail::create_reduction_operation_layer( + *polymorphic_downcast(node), ctx); case NodeType::ReorgLayer: - return detail::create_reorg_layer(*polymorphic_downcast(node)); + return detail::create_reorg_layer( + *polymorphic_downcast(node)); case NodeType::ReshapeLayer: - return detail::create_reshape_layer(*polymorphic_downcast(node)); + return detail::create_reshape_layer( + *polymorphic_downcast(node)); case NodeType::ResizeLayer: return detail::create_resize_layer(*polymorphic_downcast(node)); case 
NodeType::SliceLayer: return detail::create_slice_layer(*polymorphic_downcast(node)); case NodeType::SoftmaxLayer: - return detail::create_softmax_layer(*polymorphic_downcast(node), ctx); + return detail::create_softmax_layer( + *polymorphic_downcast(node), ctx); case NodeType::StackLayer: - return detail::create_stack_layer(*polymorphic_downcast(node)); + return detail::create_stack_layer( + *polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::create_strided_slice_layer(*polymorphic_downcast(node)); + return detail::create_strided_slice_layer( + *polymorphic_downcast(node)); default: return nullptr; } diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp index a485e5d235..a97806f92c 100644 --- a/src/graph/backends/NEON/NENodeValidator.cpp +++ b/src/graph/backends/NEON/NENodeValidator.cpp @@ -25,9 +25,9 @@ #include "arm_compute/graph/backends/ValidateHelpers.h" #include "arm_compute/graph/nodes/Nodes.h" - #include "arm_compute/runtime/CPP/CPPFunctions.h" #include "arm_compute/runtime/NEON/NEFunctions.h" + #include "support/Cast.h" using namespace arm_compute::utils::cast; @@ -56,41 +56,51 @@ struct NEUnaryEltwiseLayerFunctions Status NENodeValidator::validate(INode *node) { - if(node == nullptr) + if (node == nullptr) { return Status{}; } NodeType type = node->type(); - switch(type) + switch (type) { case NodeType::ArgMinMaxLayer: - return detail::validate_arg_min_max_layer(*polymorphic_downcast(node)); + return detail::validate_arg_min_max_layer( + *polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : BoundingBoxTransformLayer"); case NodeType::ChannelShuffleLayer: - return detail::validate_channel_shuffle_layer(*polymorphic_downcast(node)); + return detail::validate_channel_shuffle_layer( + *polymorphic_downcast(node)); case NodeType::ConvolutionLayer: - return detail::validate_convolution_layer(*polymorphic_downcast(node)); + return detail::validate_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DepthToSpaceLayer: - return detail::validate_depth_to_space_layer(*polymorphic_downcast(node)); + return detail::validate_depth_to_space_layer( + *polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: - return detail::validate_depthwise_convolution_layer(*polymorphic_downcast(node)); + return detail::validate_depthwise_convolution_layer( + *polymorphic_downcast(node)); case NodeType::DequantizationLayer: - return detail::validate_dequantization_layer(*polymorphic_downcast(node)); + return detail::validate_dequantization_layer( + *polymorphic_downcast(node)); case NodeType::DetectionOutputLayer: - return detail::validate_detection_output_layer(*polymorphic_downcast(node)); + return detail::validate_detection_output_layer( + *polymorphic_downcast(node)); case NodeType::DetectionPostProcessLayer: - return detail::validate_detection_post_process_layer(*polymorphic_downcast(node)); + return detail::validate_detection_post_process_layer( + *polymorphic_downcast(node)); case NodeType::GenerateProposalsLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : 
GenerateProposalsLayer"); case NodeType::L2NormalizeLayer: - return detail::validate_l2_normalize_layer(*polymorphic_downcast(node)); + return detail::validate_l2_normalize_layer( + *polymorphic_downcast(node)); case NodeType::NormalizePlanarYUVLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : NormalizePlanarYUVLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : NormalizePlanarYUVLayer"); case NodeType::PadLayer: return detail::validate_pad_layer(*polymorphic_downcast(node)); case NodeType::PermuteLayer: @@ -100,23 +110,29 @@ Status NENodeValidator::validate(INode *node) case NodeType::PriorBoxLayer: return detail::validate_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: - return detail::validate_quantization_layer(*polymorphic_downcast(node)); + return detail::validate_quantization_layer( + *polymorphic_downcast(node)); case NodeType::ReductionOperationLayer: - return detail::validate_reduction_operation_layer(*polymorphic_downcast(node)); + return detail::validate_reduction_operation_layer( + *polymorphic_downcast(node)); case NodeType::ReorgLayer: return detail::validate_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: return detail::validate_reshape_layer(*polymorphic_downcast(node)); case NodeType::ROIAlignLayer: - return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer"); + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unsupported operation : ROIAlignLayer"); case NodeType::SliceLayer: return detail::validate_slice_layer(*polymorphic_downcast(node)); case NodeType::StridedSliceLayer: - return detail::validate_strided_slice_layer(*polymorphic_downcast(node)); + return detail::validate_strided_slice_layer( + *polymorphic_downcast(node)); case NodeType::EltwiseLayer: - return detail::validate_eltwise_Layer(*polymorphic_downcast(node)); + return detail::validate_eltwise_Layer( + *polymorphic_downcast(node)); case NodeType::UnaryEltwiseLayer: - return detail::validate_unary_eltwise_layer(*polymorphic_downcast(node)); + return detail::validate_unary_eltwise_layer( + *polymorphic_downcast(node)); default: return Status{}; } diff --git a/src/graph/backends/NEON/NESubTensorHandle.cpp b/src/graph/backends/NEON/NESubTensorHandle.cpp index 36f29d0d10..8964a00c5e 100644 --- a/src/graph/backends/NEON/NESubTensorHandle.cpp +++ b/src/graph/backends/NEON/NESubTensorHandle.cpp @@ -29,7 +29,10 @@ namespace graph { namespace backends { -NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent) +NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, + const TensorShape &shape, + const Coordinates &coords, + bool extend_parent) : _sub_tensor(), _parent_handle(nullptr) { ARM_COMPUTE_ERROR_ON(!parent_handle); @@ -95,4 +98,4 @@ Target NESubTensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp index 4393156e8a..dabf67060d 100644 --- a/src/graph/backends/NEON/NETensorHandle.cpp +++ b/src/graph/backends/NEON/NETensorHandle.cpp @@ -24,6 +24,7 @@ #include "arm_compute/graph/backends/NEON/NETensorHandle.h" #include "arm_compute/runtime/MemoryGroup.h" + #include 
"support/Cast.h" namespace arm_compute @@ -32,8 +33,7 @@ namespace graph { namespace backends { -NETensorHandle::NETensorHandle(const ITensorInfo &info) - : _tensor() +NETensorHandle::NETensorHandle(const ITensorInfo &info) : _tensor() { _tensor.allocator()->init(info); } @@ -50,7 +50,7 @@ void NETensorHandle::free() void NETensorHandle::manage(IMemoryGroup *mg) { - if(mg != nullptr) + if (mg != nullptr) { mg->manage(&_tensor); } @@ -68,7 +68,7 @@ void NETensorHandle::unmap() void NETensorHandle::release_if_unused() { // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used - if(!_tensor.is_used()) + if (!_tensor.is_used()) { _tensor.allocator()->free(); } @@ -100,4 +100,4 @@ Target NETensorHandle::target() const } } // namespace backends } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp index b45f453f23..1e813dc678 100644 --- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp +++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp @@ -23,6 +23,8 @@ */ #include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/GraphManager.h" @@ -30,9 +32,7 @@ #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Types.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" -#include "arm_compute/core/ITensor.h" #include "support/Cast.h" #include @@ -78,28 +78,28 @@ IMemoryGroup *get_memory_group_from_handle(GraphContext &ctx, ITensorHandle *han */ std::set get_const_handles(const Graph &g) { - std::set const_node_types = { NodeType::Input, NodeType::Output, NodeType::Const }; + std::set const_node_types = {NodeType::Input, NodeType::Output, NodeType::Const}; std::set const_tensors; auto &nodes = g.nodes(); - for(auto &node : nodes) + for (auto &node : nodes) { // If its a const node: - if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types)) + if (node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types)) { // TODO (geopin01) : Create IO iterator wrappers // Add all its inputs / outputs to the list of constant handles - for(unsigned int i = 0; i < node->num_inputs(); ++i) + for (unsigned int i = 0; i < node->num_inputs(); ++i) { - if(node->input(i) != nullptr) + if (node->input(i) != nullptr) { const_tensors.insert(node->input(i)->handle()->parent_handle()); } } - for(unsigned int i = 0; i < node->num_outputs(); ++i) + for (unsigned int i = 0; i < node->num_outputs(); ++i) { - if(node->output(i) != nullptr) + if (node->output(i) != nullptr) { const_tensors.insert(node->output(i)->handle()->parent_handle()); } @@ -118,9 +118,8 @@ std::set get_const_handles(const Graph &g) * * @return List of transition handles */ -TaskHandles get_transition_handles(GraphContext &ctx, - ExecutionTask &task, - const std::set &const_tensors) +TaskHandles +get_transition_handles(GraphContext &ctx, ExecutionTask &task, const std::set &const_tensors) { ARM_COMPUTE_ERROR_ON(task.node == nullptr || (task.task == nullptr && !is_utility_node(task.node))); INode &node = *task.node; @@ -128,28 +127,30 @@ TaskHandles get_transition_handles(GraphContext &ctx, TaskHandles transition_handles; // Add input handles - 
for(unsigned int i = 0; i < node.input_edges().size(); ++i) + for (unsigned int i = 0; i < node.input_edges().size(); ++i) { Edge *input_edge = node.input_edge(i); // If this input is the output of another node - if(input_edge != nullptr && input_edge->tensor() != nullptr && const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors)) + if (input_edge != nullptr && input_edge->tensor() != nullptr && + const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors)) { // Then add it to the list of transition buffers ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle(); - IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); + IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); transition_handles.input_handles.emplace_back(std::make_pair(tensor_handle, mm_group)); } } // Add output handles - for(unsigned int i = 0; i < node.num_outputs(); ++i) + for (unsigned int i = 0; i < node.num_outputs(); ++i) { Tensor *output_tensor = node.output(i); // If this output is used as an input for another node - if(output_tensor != nullptr && const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors)) + if (output_tensor != nullptr && + const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors)) { ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle(); - IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); + IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle); transition_handles.output_handles.emplace_back(std::make_pair(tensor_handle, mm_group)); } } @@ -164,11 +165,11 @@ TaskHandles get_transition_handles(GraphContext &ctx, */ void count_input_handles_per_target(const TaskHandles &task_handles, TargetHandleCounter &handle_counter) { - for(const auto &handle : task_handles.input_handles) + for (const auto &handle : task_handles.input_handles) { ITensorHandle *key = handle.first; HandleCounter &target_counter = handle_counter[key->target()]; - if(target_counter.find(key) == std::end(target_counter)) + if (target_counter.find(key) == std::end(target_counter)) { target_counter.emplace(std::make_pair(key, 1)); } @@ -192,12 +193,12 @@ void configure_handle_lifetime(std::vector &tasks_handles, const Ha // Acquires the given handles and sets them as in flight if they aren't already auto acquire = [&](std::vector> &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ITensorHandle *parent_handle = handle.first; ARM_COMPUTE_ERROR_ON(parent_handle == nullptr); // If the tensor is not already in flight: - if(tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight)) + if (tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight)) { ARM_COMPUTE_ERROR_ON(hc.find(parent_handle) == std::end(hc)); // Then add it to the list of in flight tensors @@ -208,20 +209,20 @@ void configure_handle_lifetime(std::vector &tasks_handles, const Ha } }; - for(auto &task_handle : tasks_handles) + for (auto &task_handle : tasks_handles) { // Marking all the input and output tensors of the task as in flight acquire(task_handle.input_handles); acquire(task_handle.output_handles); // Releasing the input tensors - for(auto &input_handle : task_handle.input_handles) + for (auto &input_handle : task_handle.input_handles) { ITensorHandle *ihandle = input_handle.first; ARM_COMPUTE_ERROR_ON(ihandle == nullptr); 
ARM_COMPUTE_ERROR_ON(tensors_in_flight.find(ihandle) == std::end(tensors_in_flight)); --tensors_in_flight[ihandle]; - if(tensors_in_flight[ihandle] <= 0) + if (tensors_in_flight[ihandle] <= 0) { // Remove tensor for tensors in flight tensors_in_flight.erase(ihandle); @@ -242,7 +243,7 @@ void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload TargetHandleCounter target_handle_count; // Count handles - for(auto &task : workload.tasks) + for (auto &task : workload.tasks) { // Populates IO handles tasks_handles.push_back(get_transition_handles(ctx, task, const_tensors)); @@ -252,12 +253,12 @@ void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload } // Setup memory managers - for(auto &hc : target_handle_count) + for (auto &hc : target_handle_count) { MemoryManagerContext *mm_ctx = ctx.memory_management_ctx(hc.first); - if(mm_ctx != nullptr) + if (mm_ctx != nullptr) { - if(mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr) + if (mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr) { // Manage and allocate tensors configure_handle_lifetime(tasks_handles, hc.second); diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp index ac800df76c..870d24a6c7 100644 --- a/src/graph/detail/ExecutionHelpers.cpp +++ b/src/graph/detail/ExecutionHelpers.cpp @@ -23,12 +23,12 @@ */ #include "arm_compute/graph/detail/ExecutionHelpers.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/GraphManager.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" namespace arm_compute { @@ -41,9 +41,9 @@ void validate_all_nodes(Graph &g) auto &nodes = g.nodes(); // Create tasks - for(auto &node : nodes) + for (auto &node : nodes) { - if(node != nullptr) + if (node != nullptr) { Target assigned_target = node->assigned_target(); backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target); @@ -57,9 +57,9 @@ void configure_all_tensors(Graph &g) { auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor && tensor->handle() == nullptr) + if (tensor && tensor->handle() == nullptr) { Target target = tensor->desc().target; backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(target); @@ -72,10 +72,10 @@ void configure_all_tensors(Graph &g) void allocate_all_input_tensors(INode &node) { - for(unsigned int i = 0; i < node.num_inputs(); ++i) + for (unsigned int i = 0; i < node.num_inputs(); ++i) { Tensor *tensor = node.input(i); - if(tensor != nullptr && !tensor->bound_edges().empty()) + if (tensor != nullptr && !tensor->bound_edges().empty()) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!"); tensor->handle()->allocate(); @@ -85,10 +85,10 @@ void allocate_all_input_tensors(INode &node) void allocate_all_output_tensors(INode &node) { - for(unsigned int i = 0; i < node.num_outputs(); ++i) + for (unsigned int i = 0; i < node.num_outputs(); ++i) { Tensor *tensor = node.output(i); - if(tensor != nullptr && !tensor->bound_edges().empty()) + if (tensor != nullptr && !tensor->bound_edges().empty()) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!"); tensor->handle()->allocate(); @@ -98,11 +98,11 @@ void allocate_all_output_tensors(INode &node) void 
allocate_const_tensors(Graph &g) { - for(auto &node : g.nodes()) + for (auto &node : g.nodes()) { - if(node != nullptr) + if (node != nullptr) { - switch(node->type()) + switch (node->type()) { case NodeType::Const: case NodeType::Input: @@ -121,9 +121,10 @@ void allocate_all_tensors(Graph &g) { auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used()) + if (tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && + tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used()) { tensor->handle()->allocate(); } @@ -140,15 +141,15 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve workload.tasks.reserve(node_order.size()); // Create tasks - for(auto &node_id : node_order) + for (auto &node_id : node_order) { auto node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { Target assigned_target = node->assigned_target(); - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target); + backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target); std::unique_ptr func = backend.configure_node(*node, ctx); - if(func != nullptr || is_utility_node(node)) + if (func != nullptr || is_utility_node(node)) { workload.tasks.emplace_back(ExecutionTask(std::move(func), node)); } @@ -156,14 +157,14 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve } // Add inputs and outputs - for(auto &node : g.nodes()) + for (auto &node : g.nodes()) { - if(node != nullptr && node->type() == NodeType::Input) + if (node != nullptr && node->type() == NodeType::Input) { workload.inputs.push_back(node->output(0)); } - if(node != nullptr && node->type() == NodeType::Output) + if (node != nullptr && node->type() == NodeType::Output) { workload.outputs.push_back(node->input(0)); continue; @@ -175,9 +176,9 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve void release_unused_tensors(Graph &g) { - for(auto &tensor : g.tensors()) + for (auto &tensor : g.tensors()) { - if(tensor != nullptr && tensor->handle() != nullptr) + if (tensor != nullptr && tensor->handle() != nullptr) { tensor->handle()->release_if_unused(); } @@ -194,11 +195,11 @@ void call_all_const_node_accessors(Graph &g) { auto &nodes = g.nodes(); - for(auto &node : nodes) + for (auto &node : nodes) { - if(node != nullptr && node->type() == NodeType::Const && node->num_outputs()) + if (node != nullptr && node->type() == NodeType::Const && node->num_outputs()) { - if(!node->output(0)->bound_edges().empty()) + if (!node->output(0)->bound_edges().empty()) { call_tensor_accessor(node->output(0)); } @@ -209,18 +210,19 @@ void call_all_const_node_accessors(Graph &g) bool call_all_input_node_accessors(ExecutionWorkload &workload) { bool is_valid = true; - std::for_each(std::begin(workload.inputs), std::end(workload.inputs), [&](Tensor * input_tensor) - { - bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor(); - is_valid = is_valid && valid_input; - }); + std::for_each(std::begin(workload.inputs), std::end(workload.inputs), + [&](Tensor *input_tensor) + { + bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor(); + is_valid = is_valid && valid_input; + }); return is_valid; } void 
prepare_all_tasks(ExecutionWorkload &workload) { ARM_COMPUTE_ERROR_ON(workload.graph == nullptr); - for(auto &task : workload.tasks) + for (auto &task : workload.tasks) { task.prepare(); release_unused_tensors(*workload.graph); @@ -232,24 +234,24 @@ void call_all_tasks(ExecutionWorkload &workload) ARM_COMPUTE_ERROR_ON(workload.ctx == nullptr); // Acquire memory for the transition buffers - for(auto &mm_ctx : workload.ctx->memory_managers()) + for (auto &mm_ctx : workload.ctx->memory_managers()) { - if(mm_ctx.second.cross_group != nullptr) + if (mm_ctx.second.cross_group != nullptr) { mm_ctx.second.cross_group->acquire(); } } // Execute tasks - for(auto &task : workload.tasks) + for (auto &task : workload.tasks) { task(); } // Release memory for the transition buffers - for(auto &mm_ctx : workload.ctx->memory_managers()) + for (auto &mm_ctx : workload.ctx->memory_managers()) { - if(mm_ctx.second.cross_group != nullptr) + if (mm_ctx.second.cross_group != nullptr) { mm_ctx.second.cross_group->release(); } @@ -259,11 +261,12 @@ void call_all_tasks(ExecutionWorkload &workload) bool call_all_output_node_accessors(ExecutionWorkload &workload) { bool is_valid = true; - std::for_each(std::begin(workload.outputs), std::end(workload.outputs), [&](Tensor * output_tensor) - { - bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor(); - is_valid = is_valid && valid_output; - }); + std::for_each(std::begin(workload.outputs), std::end(workload.outputs), + [&](Tensor *output_tensor) + { + bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor(); + is_valid = is_valid && valid_output; + }); sync_backends(); diff --git a/src/graph/frontend/Stream.cpp b/src/graph/frontend/Stream.cpp index 44c8400874..383a6dc67f 100644 --- a/src/graph/frontend/Stream.cpp +++ b/src/graph/frontend/Stream.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/graph/frontend/Stream.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/frontend/ILayer.h" +#include "arm_compute/graph/Utils.h" namespace arm_compute { @@ -32,8 +32,7 @@ namespace graph { namespace frontend { -Stream::Stream(size_t id, std::string name) - : _ctx(), _manager(), _g(id, std::move(name)) +Stream::Stream(size_t id, std::string name) : _ctx(), _manager(), _g(id, std::move(name)) { } diff --git a/src/graph/frontend/SubStream.cpp b/src/graph/frontend/SubStream.cpp index 4b42207e80..8596aaa1a3 100644 --- a/src/graph/frontend/SubStream.cpp +++ b/src/graph/frontend/SubStream.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/graph/frontend/SubStream.h" -#include "arm_compute/graph/Graph.h" #include "arm_compute/graph/frontend/ILayer.h" +#include "arm_compute/graph/Graph.h" namespace arm_compute { @@ -32,8 +32,7 @@ namespace graph { namespace frontend { -SubStream::SubStream(IStream &s) - : _s(s) +SubStream::SubStream(IStream &s) : _s(s) { _hints = s.hints(); _tail_node = s.tail_node(); diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp index 963b948432..1b7ee3c4a4 100644 --- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp +++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp @@ -23,12 +23,12 @@ */ #include "arm_compute/graph/mutators/DepthConcatSubTensorMutator.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Graph.h" +#include 
"arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/ConcatenateLayerNode.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" #include "support/Iterable.h" @@ -50,7 +50,7 @@ IGraphMutator::MutationType DepthConcatSubTensorMutator::type() const void DepthConcatSubTensorMutator::mutate(Graph &g) { // Early exit if no Concatenation layers exist in graph - if(g.nodes(NodeType::ConcatenateLayer).empty()) + if (g.nodes(NodeType::ConcatenateLayer).empty()) { return; } @@ -59,43 +59,48 @@ void DepthConcatSubTensorMutator::mutate(Graph &g) std::vector topological_sorted_node_ids = dfs(g); // Should be in reverse order of execution - for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) + for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) { INode *node = g.node(node_id); - if(node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr) + if (node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr) { // Get output tensor auto output_tensor = node->output(0); // Check concatenation axis (Sub-tensor optimization is supported for concatenation axis >=2) auto *concat_node = arm_compute::utils::cast::polymorphic_downcast(node); - if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2) + if (output_tensor == nullptr || + get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2) { continue; } // Check that all tensor have the same target, valid inputs and same quantization info - bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(), - [&](const EdgeID & eid) - { - return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) - && (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info); - }); + bool is_valid = + std::all_of(node->input_edges().cbegin(), node->input_edges().cend(), + [&](const EdgeID &eid) + { + return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && + (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) && + (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info); + }); // Create subtensors - if(is_valid && is_target_supported(output_tensor->desc().target)) + if (is_valid && is_target_supported(output_tensor->desc().target)) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : " << node->id() << " and name : " << node->name() << std::endl); // Create sub-tensor handles unsigned depth = 0; - for(unsigned int i = 0; i < node->input_edges().size(); ++i) + for (unsigned int i = 0; i < node->input_edges().size(); ++i) { auto input_tensor = node->input(i); const auto input_shape = input_tensor->desc().shape; - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(input_tensor->desc().target); - std::unique_ptr handle = backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false); + backends::IDeviceBackend &backend = + backends::BackendRegistry::get().get_backend(input_tensor->desc().target); + std::unique_ptr handle = + backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false); input_tensor->set_handle(std::move(handle)); depth += input_shape.z(); diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp 
b/src/graph/mutators/GroupedConvolutionMutator.cpp index b7c551ce8b..31efba6bb1 100644 --- a/src/graph/mutators/GroupedConvolutionMutator.cpp +++ b/src/graph/mutators/GroupedConvolutionMutator.cpp @@ -23,15 +23,14 @@ */ #include "arm_compute/graph/mutators/GroupedConvolutionMutator.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" - #include "support/StringSupport.h" #include @@ -42,43 +41,51 @@ namespace graph { namespace { -NodeID create_grouped_convolution(Graph &g, const NodeParams ¶ms, NodeIdxPair input, NodeID weights, NodeID bias, - PadStrideInfo conv_info, ConvolutionMethod method, ActivationLayerInfo fused_act, FastMathHint fast_math_hint, unsigned int num_groups) +NodeID create_grouped_convolution(Graph &g, + const NodeParams ¶ms, + NodeIdxPair input, + NodeID weights, + NodeID bias, + PadStrideInfo conv_info, + ConvolutionMethod method, + ActivationLayerInfo fused_act, + FastMathHint fast_math_hint, + unsigned int num_groups) { bool has_bias = (bias != EmptyNodeID); // Split input const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); - const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL); - NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx); + const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL); + NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx); // Split weights const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]); - const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES); - NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx); + const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES); + NodeID weights_split = GraphBuilder::add_split_node(g, params, {weights, 0}, num_groups, batch_idx); // Split bias NodeID bias_split = EmptyNodeID; - if(has_bias) + if (has_bias) { // Split bias - bias_split = GraphBuilder::add_split_node(g, params, { bias, 0 }, num_groups, 0); + bias_split = GraphBuilder::add_split_node(g, params, {bias, 0}, num_groups, 0); } std::vector convolution_outputs; - for(unsigned int i = 0; i < num_groups; ++i) + for (unsigned int i = 0; i < num_groups; ++i) { NodeParams group_params = params; NodeID conv_nid = g.add_node(conv_info, 1, method, fast_math_hint); g.add_connection(input_split, i, conv_nid, 0); g.add_connection(weights_split, i, conv_nid, 1); - if(has_bias) + if (has_bias) { g.add_connection(bias_split, i, conv_nid, 2); } // Add group name - if(!group_params.name.empty()) + if (!group_params.name.empty()) { group_params.name.append("_g" + arm_compute::support::cpp11::to_string(i)); } @@ -92,7 +99,7 @@ NodeID create_grouped_convolution(Graph &g, const NodeParams ¶ms, NodeIdxPai auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(node); conv_node->set_fused_activation(fused_act); - convolution_outputs.push_back({ conv_nid, 0 }); + convolution_outputs.push_back({conv_nid, 0}); } // Depth 
concatenate output @@ -113,7 +120,7 @@ IGraphMutator::MutationType GroupedConvolutionMutator::type() const void GroupedConvolutionMutator::mutate(Graph &g) { // Early exit if no Convolution layers exist in graph - if(g.nodes(NodeType::ConvolutionLayer).empty()) + if (g.nodes(NodeType::ConvolutionLayer).empty()) { return; } @@ -122,17 +129,18 @@ void GroupedConvolutionMutator::mutate(Graph &g) size_t total_nodes = g.nodes().size(); // Iterate over convolution nodes - for(unsigned int i = 0; i < total_nodes; ++i) + for (unsigned int i = 0; i < total_nodes; ++i) { INode *node = g.node(i); - if(node != nullptr && node->type() == NodeType::ConvolutionLayer && arm_compute::utils::cast::polymorphic_downcast(node)->num_groups() != 1) + if (node != nullptr && node->type() == NodeType::ConvolutionLayer && + arm_compute::utils::cast::polymorphic_downcast(node)->num_groups() != 1) { // Validate node backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target()); Status status = backend.validate_node(*node); // If grouped convolution is not supported - if(!bool(status)) + if (!bool(status)) { // Down-cast node auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(node); @@ -151,7 +159,8 @@ void GroupedConvolutionMutator::mutate(Graph &g) ARM_COMPUTE_ERROR_ON(conv_node->input_edge(0) == nullptr || conv_node->input_edge(1) == nullptr); const NodeID input_id = conv_node->input_edge(0)->producer()->id(); const NodeID weights_id = conv_node->input_edge(1)->producer()->id(); - const NodeID bias_id = (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID; + const NodeID bias_id = + (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID; // Get driving nodes std::vector driving_nodes = get_driving_nodes(*node); @@ -164,14 +173,15 @@ void GroupedConvolutionMutator::mutate(Graph &g) NodeID latest_nid = g.nodes().size(); // Create grouped convolution node - NodeID grouped_conv_id = create_grouped_convolution(g, params, { input_id, 0 }, weights_id, bias_id, - conv_info, conv_method, fused_act_info, fast_math_hint, num_groups); + NodeID grouped_conv_id = + create_grouped_convolution(g, params, {input_id, 0}, weights_id, bias_id, conv_info, conv_method, + fused_act_info, fast_math_hint, num_groups); // Remove convolution node g.remove_node(node->id()); // Update batch normalization node outputs - for(auto &driving_node : driving_nodes) + for (auto &driving_node : driving_nodes) { g.add_connection(grouped_conv_id, 0, driving_node.node_id, driving_node.index); } @@ -180,17 +190,16 @@ void GroupedConvolutionMutator::mutate(Graph &g) g.node(grouped_conv_id)->output(0)->set_accessor(std::move(node_accessor)); // Configure new tensors and nodes - std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), [](std::unique_ptr &t) - { - configure_tensor(t.get()); - }); - std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), [&assigned_target](std::unique_ptr &n) - { - if(n != nullptr) - { - n->set_assigned_target(assigned_target); - } - }); + std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), + [](std::unique_ptr &t) { configure_tensor(t.get()); }); + std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), + [&assigned_target](std::unique_ptr &n) + { + if (n != nullptr) + { + n->set_assigned_target(assigned_target); + } + }); } } } diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp index 
d3ea940895..a51dcc4f42 100644 --- a/src/graph/mutators/InPlaceOperationMutator.cpp +++ b/src/graph/mutators/InPlaceOperationMutator.cpp @@ -29,6 +29,7 @@ #include "arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h" #include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h" + #include "support/Cast.h" using namespace arm_compute::utils::cast; @@ -48,7 +49,7 @@ bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge) const auto input_tensor = input_edge->tensor(); const auto input_edge_id = input_edge->id(); - if(parent_node == nullptr) + if (parent_node == nullptr) { return false; } @@ -57,24 +58,23 @@ bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge) // If the output is connected to only one edge, then computations can // be done in-place. - if(output_edges.size() == 1) + if (output_edges.size() == 1) { return true; } - return std::all_of(output_edges.begin(), - output_edges.end(), - [&](const EdgeID & edge_id) - { - // Skip check on current input edge - if(edge_id == input_edge_id) - { - return true; - } - - auto edge = g.edge(edge_id); - return edge->tensor() != input_tensor; - }); + return std::all_of(output_edges.begin(), output_edges.end(), + [&](const EdgeID &edge_id) + { + // Skip check on current input edge + if (edge_id == input_edge_id) + { + return true; + } + + auto edge = g.edge(edge_id); + return edge->tensor() != input_tensor; + }); } // If do in-place calculation, then need to use the new output and inherit original output's accessor @@ -109,12 +109,14 @@ void try_in_place_depthwiseconv(std::unique_ptr &node) // Extract PadStrideInfo and depth multiplier PadStrideInfo conv_info{}; unsigned int depth_multiplier{}; - if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer) + if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer) { - conv_info = polymorphic_downcast(node.get())->convolution_info(); - depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); + conv_info = + polymorphic_downcast(node.get())->convolution_info(); + depth_multiplier = + polymorphic_downcast(node.get())->depth_multiplier(); } - else if(node->type() == NodeType::DepthwiseConvolutionLayer) + else if (node->type() == NodeType::DepthwiseConvolutionLayer) { conv_info = polymorphic_downcast(node.get())->convolution_info(); depth_multiplier = polymorphic_downcast(node.get())->depth_multiplier(); @@ -126,7 +128,8 @@ void try_in_place_depthwiseconv(std::unique_ptr &node) const auto out_shape = current_output_tensor->desc().shape; const auto qinfo_out = current_output_tensor->desc().quant_info; - bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr); + bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && + (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr); // Specify conditions with which input can be in-placed input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC; @@ -141,13 +144,14 @@ void try_in_place_depthwiseconv(std::unique_ptr &node) input_can_in_place &= !conv_info.has_padding(); // NOTE: Dilation should also be (1, 1). 
However currently dilation is not supported in the depthwise conv node - if(input_can_in_place) + if (input_can_in_place) { set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor " + "or the quantization info are different.\n"); } } @@ -170,7 +174,7 @@ void try_in_place_elementwise(std::unique_ptr &node) const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1); // Inputs are not broadcast compatible - if(out_shape.total_size() == 0) + if (out_shape.total_size() == 0) { return; } @@ -181,22 +185,27 @@ void try_in_place_elementwise(std::unique_ptr &node) const auto qinfo_out = current_output_tensor->desc().quant_info; // Can do in place, if the input has same shape as output, has same quntisation info as output, has same data type as output and input doesn't have accessor. - bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out) - && (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input0_tensor->accessor() == nullptr); - bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out) - && (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input1_tensor->accessor() == nullptr); - - if(input0_can_in_place) + bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && + (qinfo0 == qinfo_out) && + (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) && + (input0_tensor->accessor() == nullptr); + bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && + (qinfo1 == qinfo_out) && + (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) && + (input1_tensor->accessor() == nullptr); + + if (input0_can_in_place) { set_new_output_and_inherit_accessor(node, current_output_tensor, input0_tensor); } - else if(input1_can_in_place) + else if (input1_can_in_place) { set_new_output_and_inherit_accessor(node, current_output_tensor, input1_tensor); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor " + "or the quantization info are different.\n"); } } } // namespace @@ -213,33 +222,31 @@ IGraphMutator::MutationType InPlaceOperationMutator::type() const void InPlaceOperationMutator::mutate(Graph &g) { - std::set in_place_nodes = - { - NodeType::ActivationLayer, - NodeType::BatchNormalizationLayer, - NodeType::EltwiseLayer, - NodeType::UnaryEltwiseLayer, - NodeType::DepthwiseConvolutionLayer, - NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer, - NodeType::PrintLayer - }; + std::set in_place_nodes = {NodeType::ActivationLayer, + NodeType::BatchNormalizationLayer, + NodeType::EltwiseLayer, + NodeType::UnaryEltwiseLayer, + NodeType::DepthwiseConvolutionLayer, + NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer, + NodeType::PrintLayer}; // Not interested in the order of nodes - for(auto &node : g.nodes()) + for (auto &node : g.nodes()) { - if(node && 
in_place_nodes.find(node->type()) != std::end(in_place_nodes)) + if (node && in_place_nodes.find(node->type()) != std::end(in_place_nodes)) { // Get input edge Edge *input_edge = node->input_edge(0); // Check if parent has a single output if yes then force in place calculation else not - if((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge)) + if ((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge)) { - if(node->type() == NodeType::EltwiseLayer) + if (node->type() == NodeType::EltwiseLayer) { try_in_place_elementwise(node); } - else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer) + else if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || + node->type() == NodeType::DepthwiseConvolutionLayer) { try_in_place_depthwiseconv(node); } @@ -252,9 +259,11 @@ void InPlaceOperationMutator::mutate(Graph &g) ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr); // Prevent in-place operation if there is an accessor bound to the in-place tensor or quantization info are different - if(new_output_tensor->accessor() != nullptr || current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info) + if (new_output_tensor->accessor() != nullptr || + current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info) { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to " + "the input tensor or the quantization info are different.\n"); } else { diff --git a/src/graph/mutators/MutatorUtils.cpp b/src/graph/mutators/MutatorUtils.cpp index c8f38f34e7..f47240eadd 100644 --- a/src/graph/mutators/MutatorUtils.cpp +++ b/src/graph/mutators/MutatorUtils.cpp @@ -29,14 +29,14 @@ namespace graph { bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList &padding_list) { - if(layout == DataLayout::NCHW || layout == DataLayout::NHWC) + if (layout == DataLayout::NCHW || layout == DataLayout::NHWC) { const unsigned int height_index = get_dimension_idx(layout, DataLayoutDimension::HEIGHT); const unsigned int width_index = get_dimension_idx(layout, DataLayoutDimension::WIDTH); - for(unsigned int i = 0; i < padding_list.size(); ++i) + for (unsigned int i = 0; i < padding_list.size(); ++i) { - if(i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0)) + if (i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0)) { // if the index is not either height or width, don't fuse return false; @@ -49,4 +49,4 @@ bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList & return false; } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp index 09a3cf50c0..588befecae 100644 --- a/src/graph/mutators/NodeExecutionMethodMutator.cpp +++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp @@ -23,11 +23,11 @@ */ #include "arm_compute/graph/mutators/NodeExecutionMethodMutator.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include 
"arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" @@ -49,17 +49,17 @@ template void set_default_on_invalid_method(Graph &g, NodeType node_type, Setter &&setter) { const std::vector &node_ids = g.nodes(node_type); - for(auto &node_id : node_ids) + for (auto &node_id : node_ids) { INode *node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { // Validate node backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target()); Status status = backend.validate_node(*node); // Set default execution method in case of failure - if(!bool(status)) + if (!bool(status)) { setter(node); } @@ -81,22 +81,26 @@ IGraphMutator::MutationType NodeExecutionMethodMutator::type() const void NodeExecutionMethodMutator::mutate(Graph &g) { // Convolution Layer - set_default_on_invalid_method(g, NodeType::ConvolutionLayer, [](INode * n) - { - ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : " - << n->id() << " and Name: " << n->name() << std::endl); - auto *casted_node = arm_compute::utils::cast::polymorphic_downcast(n); - casted_node->set_convolution_method(ConvolutionMethod::Default); - }); + set_default_on_invalid_method(g, NodeType::ConvolutionLayer, + [](INode *n) + { + ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : " + << n->id() << " and Name: " << n->name() << std::endl); + auto *casted_node = + arm_compute::utils::cast::polymorphic_downcast(n); + casted_node->set_convolution_method(ConvolutionMethod::Default); + }); // Depthwise Convolution Layer - set_default_on_invalid_method(g, NodeType::DepthwiseConvolutionLayer, [](INode * n) - { - ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : " - << n->id() << " and Name: " << n->name() << std::endl); - auto *casted_node = arm_compute::utils::cast::polymorphic_downcast(n); - casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default); - }); + set_default_on_invalid_method( + g, NodeType::DepthwiseConvolutionLayer, + [](INode *n) + { + ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : " + << n->id() << " and Name: " << n->name() << std::endl); + auto *casted_node = arm_compute::utils::cast::polymorphic_downcast(n); + casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default); + }); } } // namespace graph } // namespace arm_compute diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp index 38284b93cf..998a4a05c7 100644 --- a/src/graph/mutators/NodeFusionMutator.cpp +++ b/src/graph/mutators/NodeFusionMutator.cpp @@ -24,15 +24,14 @@ #include "arm_compute/graph/mutators/NodeFusionMutator.h" #include "arm_compute/core/utils/DataTypeUtils.h" +#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" -#include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "src/graph/mutators/MutatorUtils.h" - #include "support/Cast.h" #include @@ -46,7 +45,7 @@ namespace detail { void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode *old_node, bool add_output_tensor) { - if(new_node == nullptr || 
old_node == nullptr) + if (new_node == nullptr || old_node == nullptr) { return; } @@ -55,7 +54,7 @@ void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode std::vector last_driving_nodes = get_driving_nodes(*old_node); // Extract last fusable node accessor if any - if(old_node->output(0) == nullptr) + if (old_node->output(0) == nullptr) { return; } @@ -65,10 +64,10 @@ void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode g.remove_node(old_node->id()); // Update fused node outputs - for(auto &driving_node : last_driving_nodes) + for (auto &driving_node : last_driving_nodes) { g.add_connection(new_node->id(), 0, driving_node.node_id, driving_node.index); - if(add_output_tensor) + if (add_output_tensor) { configure_tensor(new_node->output(0)); } @@ -83,19 +82,21 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge ARM_COMPUTE_ERROR_ON(output_edge == nullptr); auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); - auto *bn_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); + auto *bn_node = + arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); // Not fusing if number of groups is greater than 1 - if(conv_node->num_groups() > 1) + if (conv_node->num_groups() > 1) { return; } - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << output_edge->producer_id() - << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " + << output_edge->producer_id() << " with BatchNormalization Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(conv_node->output(0)->accessor() == nullptr) + if (conv_node->output(0)->accessor() == nullptr) { const Target assigned_target = conv_node->assigned_target(); @@ -115,9 +116,10 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge const auto epsilon = bn_node->epsilon(); // Create the fused node - const NodeID fused_id = g.add_node(epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info); + const NodeID fused_id = g.add_node( + epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info); - if(conv_node->input_edge(2) != nullptr) + if (conv_node->input_edge(2) != nullptr) { auto conv_bias_id = conv_node->input_edge(2)->producer_id(); g.add_connection(conv_bias_id, 0, fused_id, 2); @@ -129,13 +131,13 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge g.add_connection(bn_mean_id, 0, fused_id, 3); g.add_connection(bn_var_id, 0, fused_id, 4); - if(bn_node->input_edge(3) != nullptr) + if (bn_node->input_edge(3) != nullptr) { const auto bn_beta_id = bn_node->input_edge(3)->producer_id(); g.add_connection(bn_beta_id, 0, fused_id, 5); } - if(bn_node->input_edge(4) != nullptr) + if (bn_node->input_edge(4) != nullptr) { const auto bn_gamma_id = bn_node->input_edge(4)->producer_id(); g.add_connection(bn_gamma_id, 0, fused_id, 6); @@ -147,14 +149,15 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true); fused_node->set_assigned_target(assigned_target); - fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + bn_node_name, assigned_target }); + fused_node->set_common_node_parameters(NodeParams{conv_node->name() 
+ "+" + bn_node_name, assigned_target}); // Remove convolution node g.remove_node(conv_node->id()); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n"); } } @@ -162,14 +165,17 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o { ARM_COMPUTE_ERROR_ON(output_edge == nullptr); - auto *depth_conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); - auto *bn_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); + auto *depth_conv_node = + arm_compute::utils::cast::polymorphic_downcast(output_edge->producer()); + auto *bn_node = + arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " << output_edge->producer_id() - << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " + << output_edge->producer_id() << " with BatchNormalization Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(depth_conv_node->output(0)->accessor() == nullptr) + if (depth_conv_node->output(0)->accessor() == nullptr) { const Target assigned_target = depth_conv_node->assigned_target(); @@ -189,9 +195,10 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o const auto epsilon = bn_node->epsilon(); // Create the fused node - const NodeID fused_id = g.add_node(epsilon, conv_info, depth_multiplier, depth_conv_method, act_info); + const NodeID fused_id = g.add_node( + epsilon, conv_info, depth_multiplier, depth_conv_method, act_info); - if(depth_conv_node->input_edge(2) != nullptr) + if (depth_conv_node->input_edge(2) != nullptr) { const auto conv_bias_id = depth_conv_node->input_edge(2)->producer_id(); g.add_connection(conv_bias_id, 0, fused_id, 2); @@ -211,19 +218,23 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true); fused_node->set_assigned_target(assigned_target); - fused_node->set_common_node_parameters(NodeParams{ depth_conv_node->name() + "+" + bn_node_name, assigned_target }); + fused_node->set_common_node_parameters( + NodeParams{depth_conv_node->name() + "+" + bn_node_name, assigned_target}); // Remove convolution node g.remove_node(depth_conv_node->id()); } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the " + "presence of an output accessor\n"); } } template -void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set &supported_fused_activations) +void fuse_node_with_activation(Graph &g, + const Edge *output_edge, + const std::set &supported_fused_activations) { ARM_COMPUTE_ERROR_ON(output_edge == nullptr); @@ -233,22 +244,23 @@ void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr); // Check if activation is supported for fusion - 
if(supported_fused_activations.count(act_node->activation_info().activation()) == 0) + if (supported_fused_activations.count(act_node->activation_info().activation()) == 0) { return; } // EltwiseLayerNode can only be fused when dataype is float - if(n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type)) + if (n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type)) { return; } ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id() - << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl); + << " with Activation Layer node with ID : " + << output_edge->consumer_id() << std::endl); // Prevent fusion if fused node has an output accessor - if(n_node->output(0)->accessor() == nullptr) + if (n_node->output(0)->accessor() == nullptr) { // Set activation info to fused node n_node->set_fused_activation(act_node->activation_info()); @@ -257,7 +269,8 @@ void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set } else { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n"); + ARM_COMPUTE_LOG_GRAPH_VERBOSE( + "Prevented fusion of node with activation due to the presence of an output accessor\n"); } } @@ -268,8 +281,8 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) auto *conv_node = arm_compute::utils::cast::polymorphic_downcast(output_edge->consumer()); const Edge *input_edge = pad_node->input_edge(0); - if(input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr - && pad_node->pad_value().get() == 0.0) + if (input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr && + pad_node->pad_value().get() == 0.0) { const DataLayout layout = input_edge->tensor()->desc().layout; const PaddingList padding_list = pad_node->padding(); @@ -280,18 +293,14 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) const PaddingInfo pad_w = width_index < padding_list.size() ? padding_list[width_index] : PaddingInfo(0, 0); const PaddingInfo pad_h = height_index < padding_list.size() ? 
padding_list[height_index] : PaddingInfo(0, 0); - if(is_padding_in_height_or_width(layout, padding_list)) + if (is_padding_in_height_or_width(layout, padding_list)) { // Add paddings to the convolution node const PadStrideInfo conv_info = conv_node->convolution_info(); - const PadStrideInfo new_conv_info( - conv_info.stride().first, - conv_info.stride().second, - conv_info.pad_left() + pad_w.first, - conv_info.pad_right() + pad_w.second, - conv_info.pad_top() + pad_h.first, - conv_info.pad_bottom() + pad_h.second, - conv_info.round()); + const PadStrideInfo new_conv_info(conv_info.stride().first, conv_info.stride().second, + conv_info.pad_left() + pad_w.first, conv_info.pad_right() + pad_w.second, + conv_info.pad_top() + pad_h.first, conv_info.pad_bottom() + pad_h.second, + conv_info.round()); conv_node->set_convolution_info(new_conv_info); // Update drivers of the convolution node @@ -299,7 +308,7 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) g.remove_node(pad_node->id()); // Update fused node inputs - for(auto &driver_node : pad_driver_nodes) + for (auto &driver_node : pad_driver_nodes) { g.add_connection(driver_node.node_id, driver_node.index, conv_node->id(), 0); } @@ -308,22 +317,23 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge) } template -void fuse_layer(Graph &g, std::function const &prec, const F fuse_fcn, Args &&... optional_arguments) +void fuse_layer(Graph &g, std::function const &prec, const F fuse_fcn, Args &&...optional_arguments) { // Note that fused nodes may be added to the end of the node list. // Instead of only looping over the original list of nodes, we loop over the current node list which could be growing. // This is intentional as it probes the newly added fused nodes for further fusing opportunities. - for(unsigned int i = 0; i < g.nodes().size(); ++i) + for (unsigned int i = 0; i < g.nodes().size(); ++i) { auto node = g.node(i); // Check if the node is of type N1 and not a branching node - if(node && node->type() == N1::node_type && node->output_edges().size() == 1) + if (node && node->type() == N1::node_type && node->output_edges().size() == 1) { const auto output_edge_id = *node->output_edges().begin(); const auto output_edge = g.edge(output_edge_id); // Check if following node is a type N2 node - if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer())) + if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && + (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer())) { fuse_fcn(g, output_edge, optional_arguments...); } @@ -332,22 +342,22 @@ void fuse_layer(Graph &g, std::function const &prec, const F fuse } template -void fuse_layer(Graph &g, std::function const &prec, const F fuse_fcn, Args &&... optional_arguments) +void fuse_layer(Graph &g, std::function const &prec, const F fuse_fcn, Args &&...optional_arguments) { // Note that fused nodes may be added to the end of the node list. // Instead of only looping over the original list of nodes, we loop over the current node list which could be growing. // This is intentional as it probes the newly added fused nodes for further fusing opportunities. 
- for(unsigned int i = 0; i < g.nodes().size(); ++i) + for (unsigned int i = 0; i < g.nodes().size(); ++i) { auto node = g.node(i); // Check if the node is of type N1 and not a branching node - if(node && node->type() == N1::node_type && node->output_edges().size() == 1) + if (node && node->type() == N1::node_type && node->output_edges().size() == 1) { const auto output_edge_id = *node->output_edges().begin(); const auto output_edge = g.edge(output_edge_id); // Check if it's the correct target - if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer())) + if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer())) { fuse_fcn(g, output_edge, i, optional_arguments...); } @@ -369,30 +379,24 @@ IGraphMutator::MutationType NodeFusionMutator::type() const void NodeFusionMutator::mutate(Graph &g) { // Supported activations when fusing - const std::set supported_fused_activations = { Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU, - Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU, - Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU, - Activation::RELU, Activation::SOFT_RELU, Activation::SQRT, - Activation::SQUARE, Activation::TANH - }; + const std::set supported_fused_activations = { + Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU, + Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU, + Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU, + Activation::RELU, Activation::SOFT_RELU, Activation::SQRT, + Activation::SQUARE, Activation::TANH}; // Preconditions - auto empty_prec = [](INode &) - { - return true; - }; - auto cl_target_prec = [](INode & n) - { - return n.assigned_target() == Target::CL; - }; - auto qs8_prec = [&g](INode & n) + auto empty_prec = [](INode &) { return true; }; + auto cl_target_prec = [](INode &n) { return n.assigned_target() == Target::CL; }; + auto qs8_prec = [&g](INode &n) { ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr); const auto output_edge_id = *n.output_edges().begin(); const auto output_edge = g.edge(output_edge_id); // To perform fusion the two nodes must have same output quantization information - const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info; + const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info; const bool output_qasymm8 = n.output(0)->desc().data_type == DataType::QASYMM8; return (output_qasymm8 && same_qinfo) || !output_qasymm8; @@ -400,16 +404,25 @@ void NodeFusionMutator::mutate(Graph &g) // Fusion mutations - detail::fuse_layer(g, empty_prec, detail::fuse_pad_with_convolution); - detail::fuse_layer(g, empty_prec, detail::fuse_pad_with_convolution); - detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations); - detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations); - detail::fuse_layer(g, qs8_prec, detail::fuse_node_with_activation, supported_fused_activations); - detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations); - detail::fuse_layer(g, cl_target_prec, detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer(g, empty_prec, + detail::fuse_pad_with_convolution); + detail::fuse_layer( + g, empty_prec, detail::fuse_pad_with_convolution); + detail::fuse_layer( + g, empty_prec, 
detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer( + g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer( + g, qs8_prec, detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer( + g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer( + g, cl_target_prec, detail::fuse_node_with_activation, supported_fused_activations); // The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer. Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any - detail::fuse_layer(g, empty_prec, detail::fuse_convolution_with_batch_normalization); - detail::fuse_layer(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); + detail::fuse_layer( + g, empty_prec, detail::fuse_convolution_with_batch_normalization); + detail::fuse_layer( + g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); } } // namespace graph } // namespace arm_compute diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp index 2c28a1a2d1..533f8944cf 100644 --- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp +++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp @@ -23,12 +23,12 @@ */ #include "arm_compute/graph/mutators/SplitLayerSubTensorMutator.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/algorithms/TopologicalSort.h" #include "arm_compute/graph/backends/BackendRegistry.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/Logger.h" #include "arm_compute/graph/nodes/SplitLayerNode.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" #include "support/Iterable.h" @@ -50,7 +50,7 @@ IGraphMutator::MutationType SplitLayerSubTensorMutator::type() const void SplitLayerSubTensorMutator::mutate(Graph &g) { // Early exit if no Split layers exist in graph - if(g.nodes(NodeType::SplitLayer).empty()) + if (g.nodes(NodeType::SplitLayer).empty()) { return; } @@ -59,23 +59,23 @@ void SplitLayerSubTensorMutator::mutate(Graph &g) std::vector topological_sorted_node_ids = dfs(g); // Should be in reverse order of execution - for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) + for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids)) { INode *node = g.node(node_id); - if(node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr) + if (node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr) { // Get output tensor Tensor *input_tensor = node->input(0); // Check that all tensor have the same target and are valid bool is_valid = std::all_of(node->outputs().cbegin(), node->outputs().cend(), - [&](const TensorID & tid) - { - return (g.tensor(tid) != nullptr) && (g.tensor(tid)->desc().target == input_tensor->desc().target); - }); + [&](const TensorID &tid) { + return (g.tensor(tid) != nullptr) && + (g.tensor(tid)->desc().target == input_tensor->desc().target); + }); // Create subtensors - if(is_valid && is_target_supported(input_tensor->desc().target)) + if (is_valid && is_target_supported(input_tensor->desc().target)) { ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : " << node->id() << " and name : " << node->name() << 
std::endl); @@ -87,15 +87,18 @@ void SplitLayerSubTensorMutator::mutate(Graph &g) const bool extend_parent = (axis < 2); // Create sub-tensor handles - for(unsigned int i = 0; i < node->outputs().size(); ++i) + for (unsigned int i = 0; i < node->outputs().size(); ++i) { Tensor *output_tensor = node->output(i); const TensorShape output_shape = output_tensor->desc().shape; Coordinates coords; - std::tie(std::ignore, coords) = split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i); + std::tie(std::ignore, coords) = + split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i); - backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(output_tensor->desc().target); - std::unique_ptr handle = backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent); + backends::IDeviceBackend &backend = + backends::BackendRegistry::get().get_backend(output_tensor->desc().target); + std::unique_ptr handle = + backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent); output_tensor->set_handle(std::move(handle)); } } diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp index 74d040b81d..3dc2480e85 100644 --- a/src/graph/mutators/SyntheticDataTypeMutator.cpp +++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp @@ -26,8 +26,8 @@ #include "arm_compute/graph/GraphBuilder.h" #include "arm_compute/graph/ITensorAccessor.h" #include "arm_compute/graph/Logger.h" -#include "arm_compute/graph/Utils.h" #include "arm_compute/graph/nodes/Nodes.h" +#include "arm_compute/graph/Utils.h" #include "support/Cast.h" @@ -62,14 +62,12 @@ public: */ bool is_mutation_supported(Graph &g) { - const std::set unsupported_node_types = { NodeType::DetectionOutputLayer, - NodeType::NormalizationLayer, - NodeType::PriorBoxLayer - }; + const std::set unsupported_node_types = {NodeType::DetectionOutputLayer, NodeType::NormalizationLayer, + NodeType::PriorBoxLayer}; - for(const auto &utype : unsupported_node_types) + for (const auto &utype : unsupported_node_types) { - if(!g.nodes(utype).empty()) + if (!g.nodes(utype).empty()) { return false; } @@ -83,12 +81,12 @@ bool is_mutation_supported(Graph &g) */ void remove_optimized_nodes(Graph &g) { - const std::set optimized_node_types = { NodeType::BatchNormalizationLayer }; + const std::set optimized_node_types = {NodeType::BatchNormalizationLayer}; - for(const auto &opt_type : optimized_node_types) + for (const auto &opt_type : optimized_node_types) { const std::vector opt_nodes_ids = g.nodes(opt_type); - for(const auto &node_id : opt_nodes_ids) + for (const auto &node_id : opt_nodes_ids) { INode *node = g.node(node_id); @@ -108,7 +106,7 @@ void remove_optimized_nodes(Graph &g) g.remove_node(node->id()); // Update connections - for(auto &driving_node : driving_nodes) + for (auto &driving_node : driving_nodes) { g.add_connection(producer->id(), producer_edge_id, driving_node.node_id, driving_node.index); } @@ -123,11 +121,11 @@ void remove_optimized_nodes(Graph &g) void convert_tensors(Graph &g, DataType data_type) { auto &tensors = g.tensors(); - for(auto &tensor : tensors) + for (auto &tensor : tensors) { - if(tensor != nullptr) + if (tensor != nullptr) { - switch(data_type) + switch (data_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -156,7 +154,7 @@ template void convert_special_node(Graph &g, std::function const &f) { const std::vector nodes_ids = g.nodes(NT::node_type); - for(const auto 
&nodes_id : nodes_ids) + for (const auto &nodes_id : nodes_ids) { INode *node = arm_compute::utils::cast::polymorphic_downcast(g.node(nodes_id)); ARM_COMPUTE_ERROR_ON(node == nullptr); @@ -174,41 +172,41 @@ void convert_special_node(Graph &g, std::function const */ void convert_special_tensors(Graph &g) { - auto softmax_func = [](INode * node, Tensor * tensor) + auto softmax_func = [](INode *node, Tensor *tensor) { ARM_COMPUTE_UNUSED(node); - if(tensor->desc().data_type == DataType::QASYMM8) + if (tensor->desc().data_type == DataType::QASYMM8) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); } - else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED) + else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128); } return true; }; - auto act_func = [](INode * node, Tensor * tensor) + auto act_func = [](INode *node, Tensor *tensor) { auto *act_node = arm_compute::utils::cast::polymorphic_downcast(node); - if(tensor->desc().data_type == DataType::QASYMM8) + if (tensor->desc().data_type == DataType::QASYMM8) { - if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) + if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) { tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 128); } - else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0); } } - else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED) + else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED) { - if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) + if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH) { tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 0); } - else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) { tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128); } @@ -228,22 +226,19 @@ void convert_special_tensors(Graph &g) */ void handle_nodes_with_bias(Graph &g) { - const std::set special_node_types = { NodeType::ConvolutionLayer, - NodeType::DeconvolutionLayer, - NodeType::DepthwiseConvolutionLayer, - NodeType::FullyConnectedLayer - }; + const std::set special_node_types = {NodeType::ConvolutionLayer, NodeType::DeconvolutionLayer, + NodeType::DepthwiseConvolutionLayer, NodeType::FullyConnectedLayer}; - for(const auto &spc_type : special_node_types) + for (const auto &spc_type : special_node_types) { const std::vector scp_nodes_ids = g.nodes(spc_type); - for(const auto &node_id : scp_nodes_ids) + for (const auto &node_id : scp_nodes_ids) { INode *node = g.node(node_id); - if(node != nullptr) + if (node != nullptr) { Tensor *tensor = node->input(2); - if(tensor != nullptr) + if (tensor != nullptr) { tensor->desc().data_type = DataType::S32; } @@ -253,8 +248,8 @@ void handle_nodes_with_bias(Graph &g) params.name = params.name.empty() ? 
"" : params.name + "Bias"; TensorDescriptor b_desc = node->input(1)->desc(); - auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)]; - b_desc.shape = TensorShape(depth); + auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)]; + b_desc.shape = TensorShape(depth); auto accessor = std::make_unique(); auto b_nid = GraphBuilder::add_const_node(g, params, b_desc, std::move(accessor)); @@ -266,8 +261,7 @@ void handle_nodes_with_bias(Graph &g) } } // namespace -SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type) - : _mutate_type{ mutate_type } +SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type) : _mutate_type{mutate_type} { } @@ -283,7 +277,7 @@ IGraphMutator::MutationType SyntheticDataTypeMutator::type() const void SyntheticDataTypeMutator::mutate(Graph &g) { - if(is_mutation_supported(g)) + if (is_mutation_supported(g)) { // Remove nodes that get optimized out (e.g. BatchNorm) remove_optimized_nodes(g); diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp index cf65d83a5e..1773afcb16 100644 --- a/src/graph/nodes/ActivationLayerNode.cpp +++ b/src/graph/nodes/ActivationLayerNode.cpp @@ -44,7 +44,7 @@ ActivationLayerInfo ActivationLayerNode::activation_info() const bool ActivationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -63,7 +63,7 @@ TensorDescriptor ActivationLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor output_info = src->desc(); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = _out_quant_info; } diff --git a/src/graph/nodes/ArgMinMaxLayerNode.cpp b/src/graph/nodes/ArgMinMaxLayerNode.cpp index 63163b9e2c..5adebc950a 100644 --- a/src/graph/nodes/ArgMinMaxLayerNode.cpp +++ b/src/graph/nodes/ArgMinMaxLayerNode.cpp @@ -23,16 +23,18 @@ */ #include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - namespace arm_compute { namespace graph { -ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op, unsigned int axis, DataType out_data_type, QuantizationInfo out_quant_info) +ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op, + unsigned int axis, + DataType out_data_type, + QuantizationInfo out_quant_info) : _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info)) { _input_edges.resize(1, EmptyEdgeID); @@ -56,7 +58,7 @@ DataType ArgMinMaxLayerNode::out_data_type() const bool ArgMinMaxLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -75,17 +77,18 @@ TensorDescriptor ArgMinMaxLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor output_info = src->desc(); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = _out_quant_info; } - if(_out_data_type != DataType::UNKNOWN) + if (_out_data_type != DataType::UNKNOWN) { output_info.data_type = 
_out_data_type; } - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false); + TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false); output_info.set_shape(output_shape); return output_info; diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp index ceca0e2715..c317123e8d 100644 --- a/src/graph/nodes/BatchNormalizationLayerNode.cpp +++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp @@ -55,7 +55,7 @@ void BatchNormalizationLayerNode::set_fused_activation(ActivationLayerInfo fused bool BatchNormalizationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -86,4 +86,4 @@ void BatchNormalizationLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp index f3f4f91075..8e52174639 100644 --- a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp +++ b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp @@ -23,17 +23,15 @@ */ #include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info) - : _bbox_info(info) +BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info) : _bbox_info(info) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -46,7 +44,7 @@ const BoundingBoxTransformInfo &BoundingBoxTransformLayerNode::info() const bool BoundingBoxTransformLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/ChannelShuffleLayerNode.cpp b/src/graph/nodes/ChannelShuffleLayerNode.cpp index 5102e4b6da..3cb9e23eca 100644 --- a/src/graph/nodes/ChannelShuffleLayerNode.cpp +++ b/src/graph/nodes/ChannelShuffleLayerNode.cpp @@ -30,8 +30,7 @@ namespace arm_compute { namespace graph { -ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups) - : _num_groups(num_groups) +ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups) : _num_groups(num_groups) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -44,7 +43,7 @@ unsigned int ChannelShuffleLayerNode::num_groups() const bool ChannelShuffleLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -75,4 +74,4 @@ void ChannelShuffleLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git 
a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp index 3f3c70f3bb..8e5393a5e4 100644 --- a/src/graph/nodes/ConcatenateLayerNode.cpp +++ b/src/graph/nodes/ConcatenateLayerNode.cpp @@ -24,17 +24,17 @@ #include "arm_compute/graph/nodes/ConcatenateLayerNode.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - namespace arm_compute { namespace graph { -ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, descriptors::ConcatLayerDescriptor concat_descriptor) +ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, + descriptors::ConcatLayerDescriptor concat_descriptor) : _total_nodes(total_nodes), _concat_descriptor(std::move(concat_descriptor)), _is_enabled(true) { _input_edges.resize(_total_nodes, EmptyEdgeID); @@ -73,7 +73,7 @@ TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vect // Extract shapes std::vector shapes; shapes.reserve(input_descriptors.size()); - for(auto &input_descriptor : input_descriptors) + for (auto &input_descriptor : input_descriptors) { shapes.emplace_back(&input_descriptor.shape); } @@ -85,7 +85,7 @@ TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vect bool ConcatenateLayerNode::forward_descriptors() { - if(_outputs[0] != NullTensorID) + if (_outputs[0] != NullTensorID) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -101,24 +101,22 @@ TensorDescriptor ConcatenateLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); // Check if all input tensors are set - bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid) - { - return eid != EmptyEdgeID; - }); + bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), + [](const EdgeID &eid) { return eid != EmptyEdgeID; }); TensorDescriptor output_info = {}; - if(are_all_inputs_set) + if (are_all_inputs_set) { std::vector inputs_descriptors; - for(unsigned int i = 0; i < _input_edges.size(); ++i) + for (unsigned int i = 0; i < _input_edges.size(); ++i) { const Tensor *t = _graph->tensor(input_id(i)); ARM_COMPUTE_ERROR_ON(t == nullptr); inputs_descriptors.push_back(t->desc()); } output_info = compute_output_descriptor(inputs_descriptors, _concat_descriptor.axis); - if(!_concat_descriptor.output_qinfo.empty()) + if (!_concat_descriptor.output_qinfo.empty()) { output_info.quant_info = _concat_descriptor.output_qinfo; } diff --git a/src/graph/nodes/ConstNode.cpp b/src/graph/nodes/ConstNode.cpp index eb96d63888..6e8fbff71a 100644 --- a/src/graph/nodes/ConstNode.cpp +++ b/src/graph/nodes/ConstNode.cpp @@ -30,15 +30,14 @@ namespace arm_compute { namespace graph { -ConstNode::ConstNode(TensorDescriptor desc) - : _desc(std::move(desc)) +ConstNode::ConstNode(TensorDescriptor desc) : _desc(std::move(desc)) { _outputs.resize(1, NullTensorID); } bool ConstNode::forward_descriptors() { - if(output_id(0) != NullTensorID) + if (output_id(0) != NullTensorID) { Tensor *t = output(0); ARM_COMPUTE_ERROR_ON(t == nullptr); diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp index ee9dde91d5..f0263fc84a 100644 --- a/src/graph/nodes/ConvolutionLayerNode.cpp +++ b/src/graph/nodes/ConvolutionLayerNode.cpp @@ -37,7 +37,12 @@ 
ConvolutionLayerNode::ConvolutionLayerNode(PadStrideInfo info, ConvolutionMethod method, FastMathHint fast_math_hint, QuantizationInfo out_quant_info) - : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation() + : _info(std::move(info)), + _num_groups(num_groups), + _method(method), + _fast_math_hint(fast_math_hint), + _out_quant_info(std::move(out_quant_info)), + _fused_activation() { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -100,20 +105,22 @@ TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDes const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + weights_descriptor.shape[3]); return output_descriptor; } bool ConvolutionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -132,7 +139,7 @@ TensorDescriptor ConvolutionLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr); TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = _out_quant_info; } diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp index 3542d5ad10..2058ab21e5 100644 --- a/src/graph/nodes/DeconvolutionLayerNode.cpp +++ b/src/graph/nodes/DeconvolutionLayerNode.cpp @@ -56,20 +56,22 @@ TensorDescriptor DeconvolutionLayerNode::compute_output_descriptor(const TensorD const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - 
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + weights_descriptor.shape[3]); return output_descriptor; } bool DeconvolutionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -89,7 +91,7 @@ TensorDescriptor DeconvolutionLayerNode::configure_output(size_t idx) const TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), descriptor.info); - if(!descriptor.out_quant_info.empty()) + if (!descriptor.out_quant_info.empty()) { output_info.set_quantization_info(descriptor.out_quant_info); } diff --git a/src/graph/nodes/DepthToSpaceLayerNode.cpp b/src/graph/nodes/DepthToSpaceLayerNode.cpp index b70ac56a07..0b914a0e56 100644 --- a/src/graph/nodes/DepthToSpaceLayerNode.cpp +++ b/src/graph/nodes/DepthToSpaceLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape) - : _block_shape(block_shape) +DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape) : _block_shape(block_shape) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -44,7 +43,8 @@ int DepthToSpaceLayerNode::block_shape() const return _block_shape; } -TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, int block_shape) +TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, + int block_shape) { using namespace arm_compute::helpers::tensor_transform; @@ -53,14 +53,15 @@ TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDe // Set descriptor shape TensorDescriptor output_descriptor = input_descriptor; - output_descriptor.shape = misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape); + output_descriptor.shape = + misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape); return output_descriptor; } bool DepthToSpaceLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp index 7de20165cb..92d7266088 100644 --- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp +++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp @@ -32,9 +32,15 @@ namespace arm_compute { namespace graph { -DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method, - QuantizationInfo out_quant_info) - : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _out_quant_info(std::move(out_quant_info)), _fused_activation() +DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, + int depth_multiplier, + DepthwiseConvolutionMethod method, + QuantizationInfo out_quant_info) + : _info(std::move(info)), + _depth_multiplier(depth_multiplier), + _method(method), + 
_out_quant_info(std::move(out_quant_info)), + _fused_activation() { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -89,20 +95,22 @@ TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + input_channels * depth_multiplier); return output_descriptor; } bool DepthwiseConvolutionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -121,7 +129,7 @@ TensorDescriptor DepthwiseConvolutionLayerNode::configure_output(size_t idx) con ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr); TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier); - if(!_out_quant_info.empty()) + if (!_out_quant_info.empty()) { output_info.quant_info = _out_quant_info; } @@ -139,4 +147,4 @@ void DepthwiseConvolutionLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/DequantizationLayerNode.cpp b/src/graph/nodes/DequantizationLayerNode.cpp index 14c4752f12..3ea000852a 100644 --- a/src/graph/nodes/DequantizationLayerNode.cpp +++ b/src/graph/nodes/DequantizationLayerNode.cpp @@ -40,7 +40,7 @@ DequantizationLayerNode::DequantizationLayerNode() bool DequantizationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -74,4 +74,4 @@ void DequantizationLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp index fc6f531ee0..65ddd2f5bc 100644 --- a/src/graph/nodes/DetectionOutputLayerNode.cpp +++ b/src/graph/nodes/DetectionOutputLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info) - : _info(detection_info) +DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info) : _info(detection_info) { _input_edges.resize(3, 
EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -47,7 +46,8 @@ DetectionOutputLayerInfo DetectionOutputLayerNode::detection_output_info() const TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, const DetectionOutputLayerInfo &info) { - const unsigned int max_size = info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1); + const unsigned int max_size = + info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1); TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(0, detection_size); @@ -58,7 +58,8 @@ TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const Tenso bool DetectionOutputLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && + (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/DetectionPostProcessLayerNode.cpp b/src/graph/nodes/DetectionPostProcessLayerNode.cpp index 2c5005af30..af3fc03d67 100644 --- a/src/graph/nodes/DetectionPostProcessLayerNode.cpp +++ b/src/graph/nodes/DetectionPostProcessLayerNode.cpp @@ -46,10 +46,11 @@ DetectionPostProcessLayerInfo DetectionPostProcessLayerNode::detection_post_proc bool DetectionPostProcessLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) - && (output_id(2) != NullTensorID) && (output_id(3) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && + (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID) && + (output_id(3) != NullTensorID)) { - for(unsigned int i = 0; i < 4; ++i) + for (unsigned int i = 0; i < 4; ++i) { Tensor *dst = output(i); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -68,7 +69,7 @@ TensorDescriptor DetectionPostProcessLayerNode::configure_output(size_t idx) con TensorDescriptor output_desc; const unsigned int num_detected_box = _info.max_detections() * _info.max_classes_per_detection(); - switch(idx) + switch (idx) { case 0: // Configure boxes output @@ -101,4 +102,4 @@ void DetectionPostProcessLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/DummyNode.cpp b/src/graph/nodes/DummyNode.cpp index 6fa9fbaf56..b5f37bd79b 100644 --- a/src/graph/nodes/DummyNode.cpp +++ b/src/graph/nodes/DummyNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -DummyNode::DummyNode(TensorShape shape) - : _shape(shape) +DummyNode::DummyNode(TensorShape shape) : _shape(shape) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -41,7 +40,7 @@ DummyNode::DummyNode(TensorShape shape) bool DummyNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -75,4 +74,4 @@ void DummyNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // 
namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp index 4426e953ee..3f7a08e64d 100644 --- a/src/graph/nodes/EltwiseLayerNode.cpp +++ b/src/graph/nodes/EltwiseLayerNode.cpp @@ -31,8 +31,7 @@ namespace arm_compute { namespace graph { -EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor) - : descriptor(descriptor) +EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor) : descriptor(descriptor) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -70,7 +69,7 @@ void EltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activation bool EltwiseLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -97,7 +96,7 @@ TensorDescriptor EltwiseLayerNode::configure_output(size_t idx) const output_info.set_shape(out_shape); - if(!descriptor.out_quant_info.empty()) + if (!descriptor.out_quant_info.empty()) { output_info.set_quantization_info(descriptor.out_quant_info); } @@ -134,7 +133,7 @@ void UnaryEltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activ bool UnaryEltwiseLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -153,7 +152,7 @@ TensorDescriptor UnaryEltwiseLayerNode::configure_output(size_t idx) const auto output_info = src->desc(); - if(!descriptor.out_quant_info.empty()) + if (!descriptor.out_quant_info.empty()) { output_info.set_quantization_info(descriptor.out_quant_info); } diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp index 48519a1695..952df2f3ec 100644 --- a/src/graph/nodes/FlattenLayerNode.cpp +++ b/src/graph/nodes/FlattenLayerNode.cpp @@ -38,7 +38,7 @@ FlattenLayerNode::FlattenLayerNode() bool FlattenLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -72,4 +72,4 @@ void FlattenLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp index 6278227878..1eed69ddaf 100644 --- a/src/graph/nodes/FullyConnectedLayer.cpp +++ b/src/graph/nodes/FullyConnectedLayer.cpp @@ -21,18 +21,23 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h" - #include "arm_compute/core/Utils.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" +#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h" namespace arm_compute { namespace graph { -FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, QuantizationInfo out_quant_info, FullyConnectedLayerInfo fc_info, FastMathHint fast_math_hint) - : _num_outputs(num_outputs), _out_quant_info(std::move(out_quant_info)), _info(fc_info), _fast_math_hint(fast_math_hint) +FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, + QuantizationInfo out_quant_info, + FullyConnectedLayerInfo fc_info, + FastMathHint fast_math_hint) + : _num_outputs(num_outputs), + _out_quant_info(std::move(out_quant_info)), + _info(fc_info), + _fast_math_hint(fast_math_hint) { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -60,11 +65,11 @@ TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const Tenso unsigned int num_weights = 1; unsigned int num_dimensions = input_descriptor.shape.num_dimensions(); // Ignore the batch dimension if there is one: - if(num_dimensions == 2 || num_dimensions == 4) + if (num_dimensions == 2 || num_dimensions == 4) { num_dimensions--; } - for(unsigned int i = 0; i < num_dimensions; i++) + for (unsigned int i = 0; i < num_dimensions; i++) { num_weights *= input_descriptor.shape[i]; } @@ -73,13 +78,13 @@ TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const Tenso weights_descriptor.shape = TensorShape(num_weights, num_outputs); // If weights are tranposed, use tranposed shape - if(!fc_info.transpose_weights) + if (!fc_info.transpose_weights) { weights_descriptor.shape = TensorShape(num_outputs, num_weights); } // Set quantization info if present - if(!weights_quant_info.empty()) + if (!weights_quant_info.empty()) { weights_descriptor.quant_info = weights_quant_info; } @@ -93,7 +98,7 @@ TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const Tensor { // Note: Only 1D batch space is supported at the moment unsigned int batches = input_descriptor.shape[1]; - if(input_descriptor.shape.num_dimensions() > 2) + if (input_descriptor.shape.num_dimensions() > 2) { batches = input_descriptor.shape[3]; } @@ -103,7 +108,7 @@ TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const Tensor output_descriptor.shape = TensorShape(num_outputs, batches); // Set quantization info if present - if(!out_quant_info.empty()) + if (!out_quant_info.empty()) { output_descriptor.quant_info = out_quant_info; } @@ -118,7 +123,7 @@ FullyConnectedLayerInfo FullyConnectedLayerNode::info() const bool FullyConnectedLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -147,4 +152,4 @@ void FullyConnectedLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp index de995ebee9..9d37e84acf 100644 --- a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp +++ b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp @@ -32,12 +32,18 @@ namespace arm_compute { namespace graph { 
-FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info, - unsigned int num_groups, - ConvolutionMethod method, - FastMathHint fast_math_hint, +FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, + PadStrideInfo info, + unsigned int num_groups, + ConvolutionMethod method, + FastMathHint fast_math_hint, ActivationLayerInfo fused_activation) - : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _fused_activation(fused_activation) + : _epsilon(epsilon), + _info(std::move(info)), + _num_groups(num_groups), + _method(method), + _fast_math_hint(fast_math_hint), + _fused_activation(fused_activation) { _input_edges.resize(7, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -88,9 +94,8 @@ void FusedConvolutionBatchNormalizationNode::set_fused_activation(ActivationLaye _fused_activation = fused_activation; } -TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info) +TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor( + const TensorDescriptor &input_descriptor, const TensorDescriptor &weights_descriptor, const PadStrideInfo &info) { unsigned int output_width = 0; unsigned int output_height = 0; @@ -100,20 +105,22 @@ TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descript const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + weights_descriptor.shape[3]); return output_descriptor; } bool FusedConvolutionBatchNormalizationNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp index c022450b9d..c51641d64c 100644 --- a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp +++ b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp @@ -32,18 +32,24 @@ namespace arm_compute { namespace graph { -FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode(float epsilon, - PadStrideInfo info, - unsigned int depth_multiplier, - DepthwiseConvolutionMethod method, - 
ActivationLayerInfo fused_activation) - : _epsilon(epsilon), _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation(fused_activation) +FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode( + float epsilon, + PadStrideInfo info, + unsigned int depth_multiplier, + DepthwiseConvolutionMethod method, + ActivationLayerInfo fused_activation) + : _epsilon(epsilon), + _info(std::move(info)), + _depth_multiplier(depth_multiplier), + _method(method), + _fused_activation(fused_activation) { _input_edges.resize(7, EmptyEdgeID); _outputs.resize(1, NullTensorID); } -void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method) +void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method( + DepthwiseConvolutionMethod method) { _method = method; } @@ -78,10 +84,11 @@ void FusedDepthwiseConvolutionBatchNormalizationNode::set_fused_activation(Activ _fused_activation = fused_activation; } -TensorDescriptor FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info, - int depth_multiplier) +TensorDescriptor +FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, + const TensorDescriptor &weights_descriptor, + const PadStrideInfo &info, + int depth_multiplier) { unsigned int output_width = 0; unsigned int output_height = 0; @@ -92,19 +99,22 @@ TensorDescriptor FusedDepthwiseConvolutionBatchNormalizationNode::compute_output const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); + std::tie(output_width, output_height) = + scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::WIDTH), output_width); - output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier); + output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::HEIGHT), + output_height); + output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL), + input_channels * depth_multiplier); return output_descriptor; } bool FusedDepthwiseConvolutionBatchNormalizationNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp index 9f36862818..1671a47a95 100644 --- a/src/graph/nodes/GenerateProposalsLayerNode.cpp +++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp @@ -23,17 +23,15 @@ */ #include 
"arm_compute/graph/nodes/GenerateProposalsLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info) - : _info(info) +GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info) : _info(info) { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(3, NullTensorID); @@ -46,10 +44,10 @@ const GenerateProposalsInfo &GenerateProposalsLayerNode::info() const bool GenerateProposalsLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) - && (output_id(2) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && + (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID)) { - for(unsigned int i = 0; i < 3; ++i) + for (unsigned int i = 0; i < 3; ++i) { Tensor *dst = output(i); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -68,7 +66,7 @@ TensorDescriptor GenerateProposalsLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor output_desc = src->desc(); - switch(idx) + switch (idx) { case 0: // Configure proposals output diff --git a/src/graph/nodes/InputNode.cpp b/src/graph/nodes/InputNode.cpp index 072281f259..7408bc265d 100644 --- a/src/graph/nodes/InputNode.cpp +++ b/src/graph/nodes/InputNode.cpp @@ -30,15 +30,14 @@ namespace arm_compute { namespace graph { -InputNode::InputNode(TensorDescriptor desc) - : _desc(std::move(desc)) +InputNode::InputNode(TensorDescriptor desc) : _desc(std::move(desc)) { _outputs.resize(1, NullTensorID); } bool InputNode::forward_descriptors() { - if(output_id(0) != NullTensorID) + if (output_id(0) != NullTensorID) { Tensor *t = output(0); ARM_COMPUTE_ERROR_ON(t == nullptr); diff --git a/src/graph/nodes/L2NormalizeLayerNode.cpp b/src/graph/nodes/L2NormalizeLayerNode.cpp index 0c35a335fa..1a57cf0199 100644 --- a/src/graph/nodes/L2NormalizeLayerNode.cpp +++ b/src/graph/nodes/L2NormalizeLayerNode.cpp @@ -30,18 +30,15 @@ namespace arm_compute { namespace graph { -L2NormalizeLayerNode::L2NormalizeLayerNode() - : L2NormalizeLayerNode(0, 1e-12f) +L2NormalizeLayerNode::L2NormalizeLayerNode() : L2NormalizeLayerNode(0, 1e-12f) { } -L2NormalizeLayerNode::L2NormalizeLayerNode(int axis) - : L2NormalizeLayerNode(axis, 1e-12f) +L2NormalizeLayerNode::L2NormalizeLayerNode(int axis) : L2NormalizeLayerNode(axis, 1e-12f) { } -L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) - : _axis(axis), _epsilon(epsilon) +L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) : _axis(axis), _epsilon(epsilon) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -49,7 +46,7 @@ L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) bool L2NormalizeLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -92,4 +89,4 @@ void L2NormalizeLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git 
a/src/graph/nodes/NormalizationLayerNode.cpp b/src/graph/nodes/NormalizationLayerNode.cpp index eaa1bcf924..b18bb7dd93 100644 --- a/src/graph/nodes/NormalizationLayerNode.cpp +++ b/src/graph/nodes/NormalizationLayerNode.cpp @@ -31,8 +31,7 @@ namespace arm_compute { namespace graph { -NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info) - : _info(norm_info) +NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info) : _info(norm_info) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -45,7 +44,7 @@ NormalizationLayerInfo NormalizationLayerNode::normalization_info() const bool NormalizationLayerNode::forward_descriptors() { - if(input_id(0) != NullTensorID && (output_id(0) != NullTensorID)) + if (input_id(0) != NullTensorID && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -76,4 +75,4 @@ void NormalizationLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp index 113d0a541f..cac96606ea 100644 --- a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp +++ b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp @@ -39,7 +39,7 @@ NormalizePlanarYUVLayerNode::NormalizePlanarYUVLayerNode() bool NormalizePlanarYUVLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/PReluLayerNode.cpp b/src/graph/nodes/PReluLayerNode.cpp index 378c18e3bb..2b50fe9234 100644 --- a/src/graph/nodes/PReluLayerNode.cpp +++ b/src/graph/nodes/PReluLayerNode.cpp @@ -38,7 +38,7 @@ PReluLayerNode::PReluLayerNode() bool PReluLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/PadLayerNode.cpp b/src/graph/nodes/PadLayerNode.cpp index 6424370d41..336e7de05a 100644 --- a/src/graph/nodes/PadLayerNode.cpp +++ b/src/graph/nodes/PadLayerNode.cpp @@ -23,17 +23,15 @@ */ #include "arm_compute/graph/nodes/PadLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value) - : _padding(padding), _pad_value(pad_value) +PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value) : _padding(padding), _pad_value(pad_value) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -51,7 +49,7 @@ PixelValue PadLayerNode::pad_value() const bool PadLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -71,7 +69,7 @@ TensorDescriptor PadLayerNode::configure_output(size_t idx) const TensorDescriptor output_desc = src->desc(); const TensorShape input_shape = src->desc().shape; - for(size_t dim = 0; dim < _padding.size(); ++dim) + for (size_t dim = 0; 
dim < _padding.size(); ++dim) { output_desc.shape.set(dim, _padding[dim].first + input_shape[dim] + _padding[dim].second); } diff --git a/src/graph/nodes/PermuteLayerNode.cpp b/src/graph/nodes/PermuteLayerNode.cpp index b311ee1301..db53722363 100644 --- a/src/graph/nodes/PermuteLayerNode.cpp +++ b/src/graph/nodes/PermuteLayerNode.cpp @@ -23,17 +23,15 @@ */ #include "arm_compute/graph/nodes/PermuteLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout) - : _perm(perm), _layout(layout) +PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout) : _perm(perm), _layout(layout) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -46,7 +44,7 @@ const PermutationVector &PermuteLayerNode::permutation_vector() const bool PermuteLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -66,7 +64,7 @@ TensorDescriptor PermuteLayerNode::configure_output(size_t idx) const TensorDescriptor output_desc = src->desc(); permute(output_desc.shape, _perm); - if(_layout != DataLayout::UNKNOWN) + if (_layout != DataLayout::UNKNOWN) { output_desc.layout = _layout; } @@ -84,4 +82,4 @@ void PermuteLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp index 4ecf924a5e..ac954acbe3 100644 --- a/src/graph/nodes/PoolingLayerNode.cpp +++ b/src/graph/nodes/PoolingLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info) - : _info(std::move(pool_info)) +PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info) : _info(std::move(pool_info)) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -55,7 +54,8 @@ TensorDescriptor PoolingLayerNode::compute_output_descriptor(const TensorDescrip const unsigned int pool_size_x = info.is_global_pooling ? input_width : info.pool_size.width; const unsigned int pool_size_y = info.is_global_pooling ? 
input_height : info.pool_size.height; - std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info); + std::tie(pooled_width, pooled_height) = + scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; @@ -67,7 +67,7 @@ TensorDescriptor PoolingLayerNode::compute_output_descriptor(const TensorDescrip bool PoolingLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -98,4 +98,4 @@ void PoolingLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/PrintLayerNode.cpp b/src/graph/nodes/PrintLayerNode.cpp index da408d8c4d..82a340005b 100644 --- a/src/graph/nodes/PrintLayerNode.cpp +++ b/src/graph/nodes/PrintLayerNode.cpp @@ -32,7 +32,9 @@ namespace arm_compute { namespace graph { -PrintLayerNode::PrintLayerNode(std::ostream &stream, const IOFormatInfo &format_info, const std::function transform) +PrintLayerNode::PrintLayerNode(std::ostream &stream, + const IOFormatInfo &format_info, + const std::function transform) : _stream(stream), _format_info(format_info), _transform(transform) { _input_edges.resize(1, EmptyEdgeID); @@ -56,7 +58,7 @@ const std::function PrintLayerNode::transform() const bool PrintLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -88,4 +90,4 @@ void PrintLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/PriorBoxLayerNode.cpp b/src/graph/nodes/PriorBoxLayerNode.cpp index f017ead880..5ffb173333 100644 --- a/src/graph/nodes/PriorBoxLayerNode.cpp +++ b/src/graph/nodes/PriorBoxLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info) - : _info(std::move(prior_info)) +PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info) : _info(std::move(prior_info)) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -44,7 +43,7 @@ PriorBoxLayerInfo PriorBoxLayerNode::priorbox_info() const return _info; } -TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, +TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, const PriorBoxLayerInfo &info) { const unsigned int layer_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH); @@ -61,7 +60,7 @@ TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescri bool PriorBoxLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/QuantizationLayerNode.cpp b/src/graph/nodes/QuantizationLayerNode.cpp index 4906808dae..0dd2da919d 100644 
--- a/src/graph/nodes/QuantizationLayerNode.cpp +++ b/src/graph/nodes/QuantizationLayerNode.cpp @@ -47,7 +47,7 @@ QuantizationLayerNode::QuantizationLayerNode(QuantizationInfo out_quant_info, Da bool QuantizationLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/ROIAlignLayerNode.cpp b/src/graph/nodes/ROIAlignLayerNode.cpp index 62891811f3..5909335826 100644 --- a/src/graph/nodes/ROIAlignLayerNode.cpp +++ b/src/graph/nodes/ROIAlignLayerNode.cpp @@ -24,17 +24,15 @@ #include "arm_compute/graph/nodes/ROIAlignLayerNode.h" +#include "arm_compute/core/Helpers.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/core/Helpers.h" - namespace arm_compute { namespace graph { -ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info) - : _pool_info(pool_info) +ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info) : _pool_info(pool_info) { _input_edges.resize(2, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -47,7 +45,7 @@ const ROIPoolingLayerInfo &ROIAlignLayerNode::pooling_info() const bool ROIAlignLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -92,4 +90,4 @@ void ROIAlignLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ReductionLayerNode.cpp b/src/graph/nodes/ReductionLayerNode.cpp index 0e93039894..965c1ba0a5 100644 --- a/src/graph/nodes/ReductionLayerNode.cpp +++ b/src/graph/nodes/ReductionLayerNode.cpp @@ -56,7 +56,7 @@ bool ReductionLayerNode::keep_dims() const bool ReductionLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -74,8 +74,9 @@ TensorDescriptor ReductionLayerNode::configure_output(size_t idx) const const Tensor *src = input(0); ARM_COMPUTE_ERROR_ON(src == nullptr); - TensorDescriptor output_info = src->desc(); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims); + TensorDescriptor output_info = src->desc(); + TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims); output_info.set_shape(output_shape); return output_info; @@ -91,4 +92,4 @@ void ReductionLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp index e693e4b931..251a4ea1b2 100644 --- a/src/graph/nodes/ReorgLayerNode.cpp +++ b/src/graph/nodes/ReorgLayerNode.cpp @@ -31,8 +31,7 @@ namespace arm_compute { namespace graph { -ReorgLayerNode::ReorgLayerNode(int stride) - : _stride(stride) +ReorgLayerNode::ReorgLayerNode(int stride) : _stride(stride) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); 
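[Editorial note] The hunks in this section are formatting-only reflows (whitespace, line wrapping, include ordering); no behaviour changes. The recurring conventions are: a space after control-flow keywords ("if (", "for (", "switch ("), constructor initializer lists kept on one line when short and otherwise broken to one entry per line, long argument lists wrapped at the call parenthesis, include directives regrouped, and a final newline added at end of file. The sketch below is a purely illustrative, self-contained toy class; ExampleNode and its members are invented for this note and are not part of the library. It only demonstrates the post-format style that the surrounding hunks apply.

    #include <utility>
    #include <vector>

    // Toy class written in the target style; it loosely mirrors the node
    // constructors and forward_descriptors() bodies seen in these hunks.
    class ExampleNode
    {
    public:
        ExampleNode(std::vector<int> shape, int id)
            : _shape(std::move(shape)), // one initializer per line once the list wraps
              _id(id)
        {
        }

        bool forward_descriptors() const
        {
            // Space after 'if' and 'for', as enforced throughout these hunks.
            if ((_id >= 0) && (!_shape.empty()))
            {
                int elements = 1;
                for (int dim : _shape)
                {
                    elements *= dim;
                }
                return elements > 0;
            }
            return false;
        }

    private:
        std::vector<int> _shape;
        int _id;
    };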
@@ -51,20 +50,22 @@ TensorDescriptor ReorgLayerNode::compute_output_descriptor(const TensorDescripto ARM_COMPUTE_ERROR_ON(stride <= 0); ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), + "The height of the input tensor must be a multiple of stride"); const DataLayout data_layout = input_descriptor.layout; TensorDescriptor output_descriptor = input_descriptor; output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width / stride); output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height / stride); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channel * stride * stride); + output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), + input_channel * stride * stride); return output_descriptor; } bool ReorgLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -95,4 +96,4 @@ void ReorgLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp index a6354d03ed..ce6bf9b803 100644 --- a/src/graph/nodes/ReshapeLayer.cpp +++ b/src/graph/nodes/ReshapeLayer.cpp @@ -21,17 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/graph/nodes/ReshapeLayerNode.h" - #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" +#include "arm_compute/graph/nodes/ReshapeLayerNode.h" namespace arm_compute { namespace graph { -ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) - : _shape(shape) +ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) : _shape(shape) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -39,7 +37,7 @@ ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) bool ReshapeLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -73,4 +71,4 @@ void ReshapeLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/ResizeLayerNode.cpp b/src/graph/nodes/ResizeLayerNode.cpp index 2a94bf6063..292b2c643e 100644 --- a/src/graph/nodes/ResizeLayerNode.cpp +++ b/src/graph/nodes/ResizeLayerNode.cpp @@ -50,7 +50,7 @@ std::pair ResizeLayerNode::scaling_factor() const bool ResizeLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -88,4 +88,4 @@ void ResizeLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp index b7655b9eae..eb877d9a24 100644 --- a/src/graph/nodes/SliceLayerNode.cpp +++ b/src/graph/nodes/SliceLayerNode.cpp @@ -32,8 +32,7 @@ namespace arm_compute { namespace graph { -SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends) - : _starts(starts), _ends(ends) +SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends) : _starts(starts), _ends(ends) { _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -50,19 +49,20 @@ Coordinates SliceLayerNode::ends() const } TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const Coordinates &starts, const Coordinates &ends) + const Coordinates &starts, + const Coordinates &ends) { using namespace arm_compute::helpers::tensor_transform; TensorDescriptor output_desc = input_descriptor; - output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends); + output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends); return output_desc; } bool SliceLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp index 031166993a..4beac81b1f 100644 --- a/src/graph/nodes/SoftmaxLayerNode.cpp +++ b/src/graph/nodes/SoftmaxLayerNode.cpp @@ -31,8 +31,7 @@ namespace arm_compute { namespace graph { -SoftmaxLayerNode::SoftmaxLayerNode(float beta) - : _beta(beta) +SoftmaxLayerNode::SoftmaxLayerNode(float beta) : _beta(beta) { 
_input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -45,7 +44,7 @@ float SoftmaxLayerNode::beta() const bool SoftmaxLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -79,4 +78,4 @@ void SoftmaxLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/SplitLayerNode.cpp b/src/graph/nodes/SplitLayerNode.cpp index 31931c3a79..dfb6624f80 100644 --- a/src/graph/nodes/SplitLayerNode.cpp +++ b/src/graph/nodes/SplitLayerNode.cpp @@ -49,8 +49,8 @@ unsigned int SplitLayerNode::axis() const return _axis; } -std::pair SplitLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - unsigned int num_splits, int axis, unsigned int idx) +std::pair SplitLayerNode::compute_output_descriptor( + const TensorDescriptor &input_descriptor, unsigned int num_splits, int axis, unsigned int idx) { // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). int num_dimension = static_cast(input_descriptor.shape.num_dimensions()); @@ -58,7 +58,7 @@ std::pair SplitLayerNode::compute_output_descript Coordinates coords; TensorDescriptor output_descriptor = input_descriptor; int split_size = input_descriptor.shape[tmp_axis] / num_splits; - if(_size_splits.empty()) + if (_size_splits.empty()) { output_descriptor.shape.set(tmp_axis, split_size); coords.set(tmp_axis, idx * split_size); @@ -66,15 +66,15 @@ std::pair SplitLayerNode::compute_output_descript else { int split_size = _size_splits[idx]; - if(split_size == -1) + if (split_size == -1) { split_size = input_descriptor.shape[tmp_axis]; - for(unsigned int i = 0; i < _size_splits.size() - 1; ++i) + for (unsigned int i = 0; i < _size_splits.size() - 1; ++i) split_size -= _size_splits[i]; } output_descriptor.shape.set(tmp_axis, split_size); int coord_value = 0; - for(unsigned int i = 0; i < idx; ++i) + for (unsigned int i = 0; i < idx; ++i) coord_value += _size_splits[i]; coords.set(tmp_axis, coord_value); } @@ -84,12 +84,12 @@ std::pair SplitLayerNode::compute_output_descript bool SplitLayerNode::forward_descriptors() { - if(input_id(0) != NullTensorID) + if (input_id(0) != NullTensorID) { validate(); - for(unsigned int i = 0; i < _outputs.size(); ++i) + for (unsigned int i = 0; i < _outputs.size(); ++i) { - if(output_id(i) != NullTensorID) + if (output_id(i) != NullTensorID) { Tensor *dst_i = output(i); ARM_COMPUTE_ERROR_ON(dst_i == nullptr); @@ -117,10 +117,10 @@ TensorDescriptor SplitLayerNode::configure_output(size_t idx) const int tmp_axis = wrap_around(_axis, num_dimension); int split_size = (_size_splits.empty()) ? (input_descriptor.shape[tmp_axis] / _num_splits) : _size_splits[idx]; - if(split_size == -1) + if (split_size == -1) { split_size = input_descriptor.shape[tmp_axis]; - for(unsigned int i = 0; i < _size_splits.size() - 1; ++i) + for (unsigned int i = 0; i < _size_splits.size() - 1; ++i) split_size -= _size_splits[i]; } output_descriptor.shape.set(tmp_axis, split_size); @@ -138,7 +138,7 @@ Status SplitLayerNode::validate() const // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). 
int tmp_axis = wrap_around(_axis, num_dimension); - if(_size_splits.empty()) + if (_size_splits.empty()) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[tmp_axis] % _num_splits, "Split should be exact"); } @@ -156,4 +156,4 @@ void SplitLayerNode::accept(INodeVisitor &v) v.visit(*this); } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/nodes/StackLayerNode.cpp b/src/graph/nodes/StackLayerNode.cpp index f292b33ad0..031d8fc739 100644 --- a/src/graph/nodes/StackLayerNode.cpp +++ b/src/graph/nodes/StackLayerNode.cpp @@ -25,18 +25,16 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" #include "arm_compute/graph/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - namespace arm_compute { namespace graph { -StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis) - : _total_nodes(total_nodes), _axis(axis) +StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis) : _total_nodes(total_nodes), _axis(axis) { _input_edges.resize(_total_nodes, EmptyEdgeID); _outputs.resize(1, NullTensorID); @@ -64,7 +62,7 @@ TensorDescriptor StackLayerNode::compute_output_descriptor(const std::vector= _outputs.size()); // Check if all input tensors are set - bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid) - { - return eid != EmptyEdgeID; - }); + bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), + [](const EdgeID &eid) { return eid != EmptyEdgeID; }); TensorDescriptor output_info = {}; - if(are_all_inputs_set) + if (are_all_inputs_set) { std::vector inputs_descriptors; - for(unsigned int i = 0; i < _input_edges.size(); ++i) + for (unsigned int i = 0; i < _input_edges.size(); ++i) { const Tensor *t = _graph->tensor(input_id(i)); ARM_COMPUTE_ERROR_ON(t == nullptr); diff --git a/src/graph/nodes/StridedSliceLayerNode.cpp b/src/graph/nodes/StridedSliceLayerNode.cpp index 6a1a724bb3..fc9f72204c 100644 --- a/src/graph/nodes/StridedSliceLayerNode.cpp +++ b/src/graph/nodes/StridedSliceLayerNode.cpp @@ -79,7 +79,7 @@ TensorDescriptor StridedSliceLayerNode::compute_output_descriptor(const TensorDe bool StridedSliceLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp index 9c7c4248bb..5587ed23f0 100644 --- a/src/graph/printers/DotGraphPrinter.cpp +++ b/src/graph/printers/DotGraphPrinter.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/nodes/Nodes.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/TypePrinter.h" -#include "arm_compute/graph/nodes/Nodes.h" namespace arm_compute { @@ -152,9 +152,9 @@ void DotGraphPrinter::print_footer(const Graph &g, std::ostream &os) void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os) { - for(const auto &n : g.nodes()) + for (const auto &n : g.nodes()) { - if(n) + if (n) { // Output node id std::string node_id = std::string("n") + support::cpp11::to_string(n->id()); @@ -166,7 +166,8 @@ void DotGraphPrinter::print_nodes(const Graph &g, 
std::ostream &os) std::string name = n->name().empty() ? node_id : n->name(); auto node_description = _dot_node_visitor.info(); - os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description << R"("])"; + os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description + << R"("])"; os << ";\n"; } } @@ -174,16 +175,17 @@ void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os) void DotGraphPrinter::print_edges(const Graph &g, std::ostream &os) { - for(const auto &e : g.edges()) + for (const auto &e : g.edges()) { - if(e) + if (e) { std::string source_node_id = std::string("n") + support::cpp11::to_string(e->producer_id()); std::string sink_node_id = std::string("n") + support::cpp11::to_string(e->consumer_id()); os << source_node_id << " -> " << sink_node_id << " "; const Tensor *t = e->tensor(); ARM_COMPUTE_ERROR_ON(t == nullptr); - os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" << t->desc().layout << R"("])"; + os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" + << t->desc().layout << R"("])"; os << ";\n"; } } diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp index ef7c62d64b..eca712dbf0 100644 --- a/src/runtime/Allocator.cpp +++ b/src/runtime/Allocator.cpp @@ -22,9 +22,9 @@ * SOFTWARE. */ #include "arm_compute/runtime/Allocator.h" -#include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/core/Error.h" +#include "arm_compute/runtime/MemoryRegion.h" #include diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp index bea55d8eb9..8a0fc05c39 100644 --- a/src/runtime/BlobLifetimeManager.cpp +++ b/src/runtime/BlobLifetimeManager.cpp @@ -35,8 +35,7 @@ namespace arm_compute { -BlobLifetimeManager::BlobLifetimeManager() - : _blobs() +BlobLifetimeManager::BlobLifetimeManager() : _blobs() { } @@ -62,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings() ARM_COMPUTE_ERROR_ON(_active_group == nullptr); // Sort free blobs requirements in descending order. 
- _free_blobs.sort([](const Blob & ba, const Blob & bb) - { - return ba.max_size > bb.max_size; - }); + _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; }); // Create group sizes vector std::vector group_sizes; - std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b) - { - return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() }; - }); + std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), + [](const Blob &b) { + return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()}; + }); // Update blob sizes size_t max_size = std::max(_blobs.size(), group_sizes.size()); _blobs.resize(max_size); group_sizes.resize(max_size); - std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs) - { - return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) }; - }); + std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), + [](BlobInfo lhs, BlobInfo rhs) + { + return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), + std::max(lhs.owners, rhs.owners)}; + }); // Calculate group mappings auto &group_mappings = _active_group->mappings(); int blob_idx = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp index 88e280537c..a2f63ef52b 100644 --- a/src/runtime/BlobMemoryPool.cpp +++ b/src/runtime/BlobMemoryPool.cpp @@ -47,7 +47,7 @@ BlobMemoryPool::~BlobMemoryPool() void BlobMemoryPool::acquire(MemoryMappings &handles) { // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(_blobs[handle.second].get()); @@ -56,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles) void BlobMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); @@ -78,7 +78,7 @@ void BlobMemoryPool::allocate_blobs(const std::vector &blob_info) { ARM_COMPUTE_ERROR_ON(!_allocator); - for(const auto &bi : blob_info) + for (const auto &bi : blob_info) { _blobs.push_back(_allocator->make_region(bi.size, bi.alignment)); } diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp index e06ef3d37d..b4545b93bf 100644 --- a/src/runtime/CL/CLBufferAllocator.cpp +++ b/src/runtime/CL/CLBufferAllocator.cpp @@ -35,7 +35,8 @@ namespace arm_compute void *CLBufferAllocator::allocate(size_t size, size_t alignment) { ARM_COMPUTE_UNUSED(alignment); - cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) }; + cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, + nullptr, nullptr)}; return static_cast(buf); } diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp index 
7168259fcd..d680dc08bb 100644 --- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp +++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp @@ -27,8 +27,7 @@ namespace arm_compute { -CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() - : _heuristics(std::make_unique()) +CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique()) { } CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default; diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp index 5b4bbbcde0..eb28ecbf8d 100644 --- a/src/runtime/CL/CLHelpers.cpp +++ b/src/runtime/CL/CLHelpers.cpp @@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void * @return A pointer to the context properties which can be used to create an opencl context */ -void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array &prop) +void initialise_context_properties(const cl::Platform &platform, + const cl::Device &device, + std::array &prop) { ARM_COMPUTE_UNUSED(device); #if defined(ARM_COMPUTE_ASSERTS_ENABLED) // Query devices in the context for cl_arm_printf support - if(arm_compute::device_supports_extension(device, "cl_arm_printf")) + if (arm_compute::device_supports_extension(device, "cl_arm_printf")) { // Create a cl_context with a printf_callback and user specified buffer size. - std::array properties_printf = - { + std::array properties_printf = { CL_CONTEXT_PLATFORM, reinterpret_cast(platform()), // Enable a printf callback function for this context. CL_PRINTF_CALLBACK_ARM, reinterpret_cast(printf_callback), // Request a minimum printf buffer size of 4MB for devices in the // context that support this extension. - CL_PRINTF_BUFFERSIZE_ARM, 0x1000, - 0 - }; + CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0}; prop = properties_printf; } else #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED) { - std::array properties = - { - CL_CONTEXT_PLATFORM, reinterpret_cast(platform()), - 0 - }; + std::array properties = {CL_CONTEXT_PLATFORM, + reinterpret_cast(platform()), 0}; std::copy(properties.begin(), properties.end(), prop.begin()); }; } @@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) cl::Platform::get(&platforms); ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform"); - cl::Platform selected_platform{ nullptr }; + cl::Platform selected_platform{nullptr}; // If the user has selected the Native platform, return the first available. 
- switch(cl_backend_type) + switch (cl_backend_type) { case CLBackendType::Native: selected_platform = platforms[0]; break; case CLBackendType::Clvk: - for(auto p : platforms) + for (auto p : platforms) { std::string res = p.getInfo(); - if(res.find("clvk") != std::string::npos) + if (res.find("clvk") != std::string::npos) { selected_platform = p; break; @@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) ARM_COMPUTE_ERROR("Unsupported backend type"); } - if(!selected_platform()) + if (!selected_platform()) { ARM_COMPUTE_ERROR("No valid platform found"); } @@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) return selected_platform; } -std::tuple -create_opencl_context_and_device(CLBackendType cl_backend_type) +std::tuple create_opencl_context_and_device(CLBackendType cl_backend_type) { ARM_COMPUTE_ERROR_ON(!opencl_is_available()); cl::Platform p = select_preferable_platform(cl_backend_type); @@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) std::vector platform_devices; p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices); ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device"); - device = platform_devices[0]; - cl_int err = CL_SUCCESS; - std::array properties = { 0, 0, 0, 0, 0, 0, 0 }; + device = platform_devices[0]; + cl_int err = CL_SUCCESS; + std::array properties = {0, 0, 0, 0, 0, 0, 0}; initialise_context_properties(p, device, properties); cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err); ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context"); @@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush) { ARM_COMPUTE_ERROR_ON_NULLPTR(kernel); - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr); ctx->gpu_scheduler()->enqueue(*kernel, flush); diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp index a1743c56e6..c6ee6fde83 100644 --- a/src/runtime/CL/CLMemory.cpp +++ b/src/runtime/CL/CLMemory.cpp @@ -24,24 +24,22 @@ #include "arm_compute/runtime/CL/CLMemory.h" #include "arm_compute/core/Error.h" + #include "support/Cast.h" namespace arm_compute { -CLMemory::CLMemory() - : _region(nullptr), _region_owned(nullptr) +CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr) { } -CLMemory::CLMemory(const std::shared_ptr &memory) - : _region(nullptr), _region_owned(memory) +CLMemory::CLMemory(const std::shared_ptr &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -CLMemory::CLMemory(ICLMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } @@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr region) _region_owned = utils::cast::polymorphic_downcast_unique_ptr(std::move(region)); _region = _region_owned.get(); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp index 00f91a0ffb..835958b816 100644 --- a/src/runtime/CL/CLMemoryRegion.cpp +++ b/src/runtime/CL/CLMemoryRegion.cpp @@ -29,10 +29,7 @@ namespace arm_compute { ICLMemoryRegion::ICLMemoryRegion(size_t size) - : IMemoryRegion(size), - _ctx(CLScheduler::get().context()), - 
_mapping(nullptr), - _mem() + : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem() { } @@ -57,17 +54,15 @@ std::unique_ptr ICLMemoryRegion::extract_subregion(size_t offset, return nullptr; } -CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) - : ICLMemoryRegion(size) +CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size) { - if(_size != 0) + if (_size != 0) { _mem = cl::Buffer(CLScheduler::get().context(), flags, _size); } } -CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) - : ICLMemoryRegion(buffer.getInfo()) +CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo()) { _mem = buffer; } @@ -102,10 +97,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q) ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment) : ICLMemoryRegion(size), _ptr(nullptr) { - if(size != 0) + if (size != 0) { _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment); - if(_ptr != nullptr) + if (_ptr != nullptr) { _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr); } @@ -114,7 +109,7 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a ICLSVMMemoryRegion::~ICLSVMMemoryRegion() { - if(_ptr != nullptr) + if (_ptr != nullptr) { try { @@ -125,7 +120,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion() _mem = cl::Buffer(); clSVMFree(_ctx.get(), _ptr); } - catch(...) + catch (...) { } } @@ -144,7 +139,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(_ptr == nullptr); - clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr); + clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, + nullptr); _mapping = _ptr; return _mapping; } @@ -163,7 +159,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { - if(blocking) + if (blocking) { clFinish(q.get()); } diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp index 075a544077..89d4520038 100644 --- a/src/runtime/CL/CLOperator.cpp +++ b/src/runtime/CL/CLOperator.cpp @@ -30,14 +30,13 @@ namespace arm_compute { namespace experimental { -ICLOperator::ICLOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void ICLOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp index 5083b4b0c5..b426b8c304 100644 --- a/src/runtime/CL/CLRuntimeContext.cpp +++ b/src/runtime/CL/CLRuntimeContext.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -29,7 +30,10 @@ namespace arm_compute { CLRuntimeContext::CLRuntimeContext() - : _gpu_owned_scheduler(std::make_unique()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type() + : _gpu_owned_scheduler(std::make_unique()), + _gpu_scheduler(_gpu_owned_scheduler.get()), + _symbols(), + _backend_type() { _symbols.load_default(); auto ctx_dev_err = create_opencl_context_and_device(_backend_type); diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index b7a4dff45d..f0a42f55fd 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event() void CLScheduler::tune_kernel_static(ICLKernel &kernel) { - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { _cl_tuner->tune_kernel_static(kernel); } @@ -95,8 +96,16 @@ bool CLScheduler::is_initialised() const std::once_flag CLScheduler::_initialize_symbols; CLScheduler::CLScheduler() - : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native), _job_chaining_enabled(true), - _job_chaining_size(1), _job_chaining_count(0) + : _context(), + _queue(), + _target(GPUTarget::MIDGARD), + _is_initialised(false), + _cl_tuner(nullptr), + _gemm_heuristics(nullptr), + _backend_type(CLBackendType::Native), + _job_chaining_enabled(true), + _job_chaining_size(1), + _job_chaining_count(0) { } @@ -107,9 +116,12 @@ CLScheduler &CLScheduler::get() return scheduler; } -void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h) +void CLScheduler::default_init_with_context(cl::Device &device, + cl::Context &ctx, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h) { - if(!_is_initialised) + if (!_is_initialised) { const std::string cl_kernels_folder("./cl_kernels/"); cl::CommandQueue queue = cl::CommandQueue(ctx, device); @@ -121,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) { - if(!_is_initialised) + if (!_is_initialised) { cl::Context ctx; cl::Device dev; @@ -151,7 +163,12 @@ void CLScheduler::set_context(cl::Context context) CLKernelLibrary::get().set_context(_context); } -void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +void CLScheduler::init(cl::Context context, + cl::CommandQueue queue, + const cl::Device &device, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h, + CLBackendType cl_backend_type) { set_context(std::move(context)); _queue = std::move(queue); @@ -164,21 +181,21 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush) { - ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, - "The CLScheduler is not initialised yet! 
Please call the CLScheduler::get().default_init(), \ + ARM_COMPUTE_ERROR_ON_MSG( + !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); const bool inject_memory = !tensors.empty(); // Tune the kernel if the CLTuner has been provided - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel); } // Run kernel inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue); - if(_job_chaining_enabled) + if (_job_chaining_enabled) { ++_job_chaining_count; } @@ -188,9 +205,9 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f void CLScheduler::flush_queue(bool flush) { - if(_job_chaining_enabled) + if (_job_chaining_enabled) { - if(_job_chaining_count >= _job_chaining_size) + if (_job_chaining_count >= _job_chaining_size) { _job_chaining_count = 0; /* @@ -199,14 +216,14 @@ void CLScheduler::flush_queue(bool flush) the CPU activity for job-scheduling. For eg. job-chain size goes from 1, 2, 4, 8 and 16 */ - if(_job_chaining_size < 16) + if (_job_chaining_size < 16) { _job_chaining_size <<= 1; } _queue.flush(); } } - else if(flush) + else if (flush) { _queue.flush(); } diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp index 14936ae23c..ace820bbb7 100644 --- a/src/runtime/CL/CLSubTensor.cpp +++ b/src/runtime/CL/CLSubTensor.cpp @@ -29,12 +29,14 @@ using namespace arm_compute; -CLSubTensor::CLSubTensor() - : _parent(nullptr), _info() +CLSubTensor::CLSubTensor() : _parent(nullptr), _info() { } -CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent) +CLSubTensor::CLSubTensor(ICLTensor *parent, + const TensorShape &tensor_shape, + const Coordinates &coords, + bool extend_parent) : _parent(nullptr), _info() { ARM_COMPUTE_ERROR_ON(parent == nullptr); @@ -81,7 +83,7 @@ void CLSubTensor::unmap() uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); - if(_parent->buffer() == nullptr) + if (_parent->buffer() == nullptr) { _parent->map(q, blocking); } diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index f85b8ae777..e6457218c7 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr; std::unique_ptr allocate_region(size_t size, cl_uint alignment) { // Try fine-grain SVM - std::unique_ptr region = std::make_unique(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - alignment); + std::unique_ptr region = + std::make_unique(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment); // Try coarse-grain SVM in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique(CL_MEM_READ_WRITE, size, alignment); } // Try legacy buffer memory in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size); } @@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset) * @param[in] qinfo Quantization info * @param[in] 
pad_size Pad size to use in case array needs to be padded for computation purposes */ -void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size) +void populate_quantization_info(CLFloatArray &scale, + CLInt32Array &offset, + const QuantizationInfo &qinfo, + size_t pad_size) { clear_quantization_arrays(scale, offset); @@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const const size_t element_size = sizeof(std::remove_reference::type::value_type); scale = CLFloatArray(num_elements + pad_size); scale.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data()); + CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, + qinfo.scale().data()); - if(!qinfo.offset().empty()) + if (!qinfo.offset().empty()) { // Create offset array - const std::vector &qoffset = qinfo.offset(); - const size_t offset_element_size = sizeof(std::remove_reference::type::value_type); - offset = CLInt32Array(num_elements + pad_size); + const std::vector &qoffset = qinfo.offset(); + const size_t offset_element_size = sizeof(std::remove_reference::type::value_type); + offset = CLInt32Array(num_elements + pad_size); offset.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data()); + CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, + num_elements * offset_element_size, qinfo.offset().data()); } } } // namespace @@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext CLQuantization CLTensorAllocator::quantization() const { - return { &_scale, &_offset }; + return {&_scale, &_offset}; } uint8_t *CLTensorAllocator::data() @@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const void CLTensorAllocator::allocate() { // Allocate tensor backing memory - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { // Perform memory allocation - if(static_global_cl_allocator != nullptr) + if (static_global_cl_allocator != nullptr) { _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0)); } @@ -146,7 +150,7 @@ void CLTensorAllocator::allocate() } // Allocate and fill the quantization parameter arrays - if(is_data_type_quantized_per_channel(info().data_type())) + if (is_data_type_quantized_per_channel(info().data_type())) { const size_t pad_size = 0; populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size); @@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator) uint8_t *CLTensorAllocator::lock() { - if(_ctx) + if (_ctx) { return map(_ctx->gpu_scheduler()->queue(), true); } @@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock() void CLTensorAllocator::unlock() { ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - if(_ctx) + if (_ctx) { unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast(_memory.region()->buffer())); } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 445638f01f..0d62fe3afe 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" + #include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -37,19 +38,23 @@ namespace arm_compute { CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) - : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info) + : real_clEnqueueNDRangeKernel(nullptr), + _tuning_params_table(), + _lws_table(), + _kernel_event(), + _tune_new_kernels(tune_new_kernels), + _tuning_info(tuning_info) { } struct CLTuner::IKernelData { - virtual ~IKernelData() = default; + virtual ~IKernelData() = default; virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; }; struct DefaultKernelData : public CLTuner::IKernelData { - DefaultKernelData(ITensorPack &tensors) - : _tensors{ tensors } + DefaultKernelData(ITensorPack &tensors) : _tensors{tensors} { } ~DefaultKernelData() override = default; @@ -100,16 +105,17 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel) void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Get the configuration ID from the kernel and append GPU target name and number of available compute units - const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); + const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); // Check if we need to find the Optimal LWS. 
If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned - if(kernel.config_id() != arm_compute::default_config_id) + if (kernel.config_id() != arm_compute::default_config_id) { auto p = _tuning_params_table.find(config_id); - if(p == _tuning_params_table.end()) + if (p == _tuning_params_table.end()) { - if(_tune_new_kernels) + if (_tune_new_kernels) { // Find the optimal LWS for the kernel CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); @@ -119,7 +125,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_tuning_params.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); } @@ -129,7 +135,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Set Local-Workgroup-Size kernel.set_lws_hint(p->second.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(p->second.get_wbsm()); } @@ -138,7 +144,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) } void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) { - DefaultKernelData data{ tensors }; + DefaultKernelData data{tensors}; do_tune_kernel_dynamic(kernel, &data); } @@ -154,7 +160,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat cl::CommandQueue queue_profiler; // Extract real OpenCL function to intercept - if(real_clEnqueueNDRangeKernel == nullptr) + if (real_clEnqueueNDRangeKernel == nullptr) { real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; } @@ -165,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Check if we can use the OpenCL timer with the default queue cl_command_queue_properties props = default_queue.getInfo(); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) { // Set the queue for profiling queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); @@ -176,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat } // Start intercepting enqueues: - auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, cl_event * event) + auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, + const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(this->kernel_event_is_set()) + if (this->kernel_event_is_set()) { // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues. 
return CL_SUCCESS; } cl_event tmp; - cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &tmp); // Set OpenCL event this->set_cl_kernel_event(tmp); - if(event != nullptr) + if (event != nullptr) { //return cl_event from the intercepted call clRetainEvent(tmp); @@ -209,9 +217,10 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() /// Please see COMPMID-5934 cl::NDRange gws = kernel.get_cached_gws(); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", - kernel.config_id().c_str(), to_string(gws).c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL( + arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(), + to_string(gws).c_str()); queue_profiler.finish(); @@ -224,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Construct the list of tuning parameters values to be tested based on the tuner mode. auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); - for(size_t i = 0; i < tuning_list->size(); ++i) + for (size_t i = 0; i < tuning_list->size(); ++i) { CLTuningParams tuning_test = (*tuning_list)[i]; // Setting the lws @@ -234,19 +243,18 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat auto z = lws_test[2]; const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); - if(invalid_lws) + if (invalid_lws) { continue; } kernel.set_lws_hint(lws_test); - if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) { cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Trying LWS: %s, WBSM: %d", + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d", to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel @@ -260,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat _kernel_event = nullptr; // Check the execution time - if(diff < min_exec_time) + if (diff < min_exec_time) { min_exec_time = diff; opt_tuning_params.set_lws(tuning_test.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); } @@ -292,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; bool header_line = true; - while(!std::getline(fs, line).fail()) + while (!std::getline(fs, line).fail()) { - if(header_line) + if (header_line) { header_line = false; size_t pos_lws = line.find("lws"); size_t pos_wbsm = line.find("wbsm"); _tuning_info.tune_wbsm = false; - if(pos_lws != 
std::string::npos || pos_wbsm != std::string::npos) + if (pos_lws != std::string::npos || pos_wbsm != std::string::npos) { // The file has in the first line the parameters it has been tuned on - if(pos_wbsm != std::string::npos) + if (pos_wbsm != std::string::npos) { _tuning_info.tune_wbsm = true; } // Once the line with the tuning parameter is read we can // read the next one to start collecting the values - if(std::getline(fs, line).fail()) + if (std::getline(fs, line).fail()) { break; } @@ -324,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename) CLTuningParams tuning_params; size_t pos = line.find(";"); - if(pos == std::string::npos) + if (pos == std::string::npos) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } std::string kernel_id = line.substr(0, pos); line.erase(0, pos + 1); - if(!tuning_params.from_string(_tuning_info, line)) + if (!tuning_params.from_string(_tuning_info, line)) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } @@ -341,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename) bool CLTuner::save_to_file(const std::string &filename) const { - if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) + if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) { return false; } @@ -350,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const fs.open(filename, std::ios::out); std::string header_string = ""; header_string += "lws"; - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { - if(!header_string.empty()) + if (!header_string.empty()) { header_string += " "; } header_string += "wbsm"; } fs << header_string << std::endl; - for(auto const &kernel_data : _tuning_params_table) + for (auto const &kernel_data : _tuning_params_table) { CLTuningParams tun_pams(kernel_data.second); fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index 4530537789..bc782c3a2c 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT - : _kernel(), - _border_handler(std::make_unique()), - _ctx(ctx) + : _kernel(), _border_handler(std::make_unique()), _ctx(ctx) { } diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp index da3d4850bf..294396c28a 100644 --- a/src/runtime/CL/Utils.cpp +++ b/src/runtime/CL/Utils.cpp @@ -35,20 +35,20 @@ namespace arm_compute void restore_program_cache_from_file(const std::string &filename) { std::ifstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - if(!CLScheduler::get().is_initialised()) + if (!CLScheduler::get().is_initialised()) { arm_compute::CLScheduler::get().default_init(); } - while(!cache_file.eof()) + while (!cache_file.eof()) { size_t name_len = 0; size_t binary_len = 0; cache_file.read(reinterpret_cast(&name_len), sizeof(size_t)); cache_file.read(reinterpret_cast(&binary_len), sizeof(size_t)); - if(name_len == 0 || binary_len == 0) + if (name_len == 0 || binary_len == 0) { break; } @@ -60,7 +60,7 @@ void restore_program_cache_from_file(const 
std::string &filename) tmp.resize(binary_len); cache_file.read(reinterpret_cast(binary.data()), binary_len); cl::Context context = arm_compute::CLScheduler::get().context(); - cl::Program::Binaries binaries{ binary }; + cl::Program::Binaries binaries{binary}; std::vector devices = context.getInfo(); cl::Program program(context, devices, binaries); program.build(); @@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename) void save_program_cache_to_file(const std::string &filename) { - if(CLScheduler::get().is_initialised()) + if (CLScheduler::get().is_initialised()) { std::ofstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - for(const auto &it : CLKernelLibrary::get().get_built_programs()) + for (const auto &it : CLKernelLibrary::get().get_built_programs()) { std::vector> binaries = it.second.getInfo(); ARM_COMPUTE_ERROR_ON(binaries.size() != 1); diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index f324b1a68c..c035644e4a 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClActivation.h" @@ -35,18 +36,17 @@ namespace arm_compute { struct CLActivationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) - : _impl(std::make_unique()) +CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; +CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default; CLActivationLayer::~CLActivationLayer() = default; @@ -55,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info); } -void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -66,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info); } -Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return opencl::ClActivation::validate(input, output, act_info); } diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index b30d739025..f9bbd31e8a 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -27,31 +27,39 @@ 
#include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis() + : _memory_group(std::move(memory_manager)), + _not_reshaped_output(), + _arg_min_max_kernel(), + _reshape(), + _reduction_axis() { } CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; -Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); DataType output_data_type = DataType::S32; @@ -59,17 +67,18 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - if(output->total_size() != 0) + if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } auto shape_before_reshape = input->tensor_shape(); shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; @@ -85,20 +94,36 @@ void CLArgMinMaxLayer::configure(const 
ICLTensor *input, int axis, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); } -void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); _reduction_axis = axis; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); - - TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() }; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + DataType output_data_type = + (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + TensorShape not_reshaped_output_shape{input->info()->tensor_shape()}; not_reshaped_output_shape.set(axis, 1); - auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(not_reshaped_output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _arg_min_max_kernel = std::make_unique(); _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op); diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index e8affc0853..0c371c4171 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -30,9 +30,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" namespace arm_compute { @@ -43,24 +42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer() CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; -void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, +void CLBatchNormalizationLayer::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, 
const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, float epsilon, - ActivationLayerInfo act_info) +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } -Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info); } @@ -69,4 +84,4 @@ void CLBatchNormalizationLayer::run() { CLScheduler::get().enqueue(*_norm_kernel, true); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index d7a409128d..a3798daf61 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -30,14 +30,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" namespace arm_compute { -CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel(std::make_unique()) +CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique()) { } @@ -49,29 +47,43 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure( + const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t 
block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output); _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return CLBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index a4712ed3f1..7bfd0e3677 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,11 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, BitwiseOperation::AND); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 5964b92447..9763915c02 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -43,4 +42,4 @@ void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLT k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index a07bf17bb2..dd3171b982 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,11 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC configure(CLKernelLibrary::get().get_compile_context(), 
input1, input2, output); } -void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseOr::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, BitwiseOperation::OR); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index f65e2e406c..5bee4b37ec 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,7 +35,10 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseXor::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 48583bfaf3..76e626fd75 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -23,18 +23,24 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" namespace arm_compute { -void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); @@ -44,7 +50,10 @@ void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, _kernel = std::move(k); } -Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git 
a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 10f7cc2065..42ec8f7ee0 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include @@ -37,16 +37,15 @@ namespace arm_compute { struct CLCast::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCast::CLCast() - : _impl(std::make_unique()) +CLCast::CLCast() : _impl(std::make_unique()) { } -CLCast::CLCast(CLCast &&) = default; +CLCast::CLCast(CLCast &&) = default; CLCast &CLCast::operator=(CLCast &&) = default; CLCast::~CLCast() = default; @@ -55,7 +54,10 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); } -void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +void CLCast::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, policy); @@ -74,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void CLCast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index 021f28f238..1ee4789816 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 192a266f0f..2f54371e88 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLComparisonKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include 
"src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { @@ -37,25 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparison::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_LOG_PARAMS(input2, input2, output, operation); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, operation); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } -Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { return CLComparisonKernel::validate(input1, input2, output, operation); } @@ -67,25 +75,30 @@ void CLComparisonStatic::configure(ICLTensor *input1, ICLTensor *input2, IC } template -void CLComparisonStatic::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLComparisonStatic::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, COP); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } template -Status CLComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status +CLComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { return CLComparisonKernel::validate(input1, input2, output, COP); } diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 0a8884f4e3..9df1c34593 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -24,24 +24,23 @@ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConcatenate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConcatenate.h" namespace arm_compute { struct CLConcatenateLayer::Impl { std::vector srcs{}; - ICLTensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + ICLTensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr op{nullptr}; }; -CLConcatenateLayer::CLConcatenateLayer() - : _impl(std::make_unique()) +CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique()) { } @@ -56,7 +55,10 @@ void CLConcatenateLayer::configure(std::vector &inputs_vector configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); } -void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, + std::vector &inputs_vector, + ICLTensor *output, + size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis); @@ -68,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op = std::make_unique(); std::vector inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -76,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); } -Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, + const ITensorInfo *output, + size_t axis) { return opencl::ClConcatenate::validate(inputs_vector, output, axis); } @@ -84,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector &inpu void CLConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp index 
729b973b6a..9d1b368f72 100644 --- a/src/runtime/CL/functions/CLConv3D.cpp +++ b/src/runtime/CL/functions/CLConv3D.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLConv3D.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/operators/ClDirectConv3d.h" namespace arm_compute @@ -32,29 +33,38 @@ using namespace arm_compute::experimental; struct CLConv3D::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLConv3D::CLConv3D() - : _impl(std::make_unique()) +CLConv3D::CLConv3D() : _impl(std::make_unique()) { } CLConv3D::~CLConv3D() = default; -void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); } -void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate( + src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); _impl->src = src; _impl->weights = weights; @@ -62,10 +72,15 @@ void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTenso _impl->dst = dst; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); + _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), + _impl->biases ? 
_impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); } -Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status CLConv3D::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); } diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index b3efe5c8a0..2298f2a669 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -27,33 +27,37 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" namespace arm_compute { struct CLConvertFullyConnectedWeights::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() - : _impl(std::make_unique()) +CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique()) { } CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default; -void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); } -void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); @@ -63,8 +67,10 @@ void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_c _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index f3c05adb47..7767b45a01 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -28,11 +28,11 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConv2d.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -43,41 +43,59 @@ struct CLConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr memory_manager{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr func{ nullptr }; + std::unique_ptr func{nullptr}; }; -CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } CLConvolutionLayer::~CLConvolutionLayer() = default; -void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math, num_groups); } -void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, - weights_info, CLScheduler::get().target())) + switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, + weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: @@ -85,7 +103,8 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT case ConvolutionMethod::GEMM: { auto f = std::make_unique(); - f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + f->configure(compile_context, input->info(), weights->info(), + ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); _impl->op = std::move(f); break; } @@ -101,40 +120,52 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT break; } - if(_impl->op) + if (_impl->op) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CLConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) + 
switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: case ConvolutionMethod::INDIRECT: case ConvolutionMethod::GEMM: { - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); break; } case ConvolutionMethod::FFT: { // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, + act_info, enable_fast_math)); break; } default: @@ -145,8 +176,15 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math) +ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const ActivationLayerInfo &act_info, + const GPUTarget gpu_target, + const Size2D &dilation, + bool enable_fast_math) { const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1); return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target); @@ -158,7 +196,7 @@ void CLConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -170,7 +208,7 @@ void CLConvolutionLayer::run() void CLConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 56400b67a0..a4f2b0634f 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCopy.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCopy.h" #include @@ -38,16 +38,15 @@ namespace arm_compute { struct CLCopy::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCopy::CLCopy() - : _impl(std::make_unique()) +CLCopy::CLCopy() : _impl(std::make_unique()) { } -CLCopy::CLCopy(CLCopy &&) = default; +CLCopy::CLCopy(CLCopy &&) = default; CLCopy &CLCopy::operator=(CLCopy &&) = default; CLCopy::~CLCopy() = default; diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp index 35ea17cfc2..fc29c43827 100644 --- a/src/runtime/CL/functions/CLCrop.cpp +++ b/src/runtime/CL/functions/CLCrop.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include 
"arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCrop.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCrop.h" #include @@ -38,27 +38,38 @@ namespace arm_compute { struct CLCrop::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCrop::CLCrop() - : _impl(std::make_unique()) +CLCrop::CLCrop() : _impl(std::make_unique()) { } -CLCrop::CLCrop(CLCrop &&) = default; +CLCrop::CLCrop(CLCrop &&) = default; CLCrop &CLCrop::operator=(CLCrop &&) = default; CLCrop::~CLCrop() = default; -void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { - configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window); + configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, + dst_window); } -void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); @@ -67,10 +78,17 @@ void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor _impl->dst = dst; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, + extrapolation_value, dst_window); } -Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status CLCrop::validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window); } diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index d8fc38d99e..821412b149 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,19 +25,26 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/common/utils/Log.h" - #include namespace arm_compute { namespace { -inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, 
ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index) +inline void configure_crop(const ICLTensor *input, + ICLTensor *crop_boxes, + ICLTensor *box_ind, + ICLTensor *output, + uint32_t crop_box_ind, + Coordinates &start, + Coordinates &end, + uint32_t &batch_index) { batch_index = *(reinterpret_cast(box_ind->ptr_to_element(Coordinates(crop_box_ind)))); @@ -50,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers. start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast(abs(end[0] - start[0])) + 1, static_cast(abs(end[1] - start[1])) + 1); + end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast(abs(end[0] - start[0])) + 1, + static_cast(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions() + : _input(nullptr), + _boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _scale(), + _copy(), + _crop_results(), + _scaled_results(), + _internal_functions() { } CLCropResize::~CLCropResize() = default; -Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status CLCropResize::validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1}, + input->dimension(3) - 1, extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -83,20 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen return Status{}; } -void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const ICLTensor *input, + ICLTensor *boxes, + 
ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { - configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value); + configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, + extrapolation_value); } -void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind); - ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); - TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); + TensorShape output_shape = + TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -122,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // kernels used for cropping and scaling. _boxes->map(CLScheduler::get().queue()); _box_ind->map(CLScheduler::get().queue()); - for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) + for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { auto crop_tensor = std::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -143,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); auto scale_kernel = std::make_unique(); - scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT }); + scale_kernel->configure( + compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT}); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); @@ -159,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT bool is_width_flipped = end[0] < start[0]; bool is_height_flipped = end[1] < start[1]; /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array rows_out_of_bounds{ 0 }; + std::array rows_out_of_bounds{0}; /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). 
*/ - std::array cols_out_of_bounds{ 0 }; - if(is_height_flipped) + std::array cols_out_of_bounds{0}; + if (is_height_flipped) { - rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(start[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; + rows_out_of_bounds[1] = + end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) + : 0; } else { - rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[0] = + start[1] < 0 + ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) + : 0; + rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(end[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(start[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; + cols_out_of_bounds[1] = + end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) + : 0; } else { - cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[0] = + start[0] < 0 + ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) + : 0; + cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(end[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; } Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); @@ -203,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds // with the extrapolation value using memset. // First for the rows before the in bounds rows. 
- if(rows_out_of_bounds[0] > 0) + if (rows_out_of_bounds[0] > 0) { Window slice_fill_rows_before(full_window); slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); - - int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) + slice_in.set(2, + Window::Dimension(rows_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, + Window::Dimension(cols_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + + int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - + rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if (rows_in_bounds > 0) { // Fill all elements that share a row with an in bounds element with the extrapolation value. - if(cols_out_of_bounds[0] > 0) + if (cols_out_of_bounds[0] > 0) { Window slice_fill_cols_before(slice_in); slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } - if(cols_out_of_bounds[1] > 0) + if (cols_out_of_bounds[1] > 0) { Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); + slice_fill_cols_after.set( + 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(1), 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } // Copy all elements within the input bounds from the input tensor. - int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) + int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - + cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if (cols_in_bounds > 0) { - Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? 
start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; + Coordinates2D start_in{ + is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]}; + Coordinates2D end_in{ + is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1}; auto kernel = std::make_unique(); - kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, + extrapolation_value, &slice_in); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } } // Fill all rows after the in bounds elements with the extrapolation value. - if(rows_out_of_bounds[1] > 0) + if (rows_out_of_bounds[1] > 0) { Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); + slice_fill_rows_after.set( + 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(2), 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } @@ -277,18 +357,18 @@ void CLCropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _internal_functions.size(); ++i) + for (unsigned int i = 0; i < _internal_functions.size(); ++i) { _internal_functions[i]->run(); } CLScheduler::get().sync(); - for(auto &kernel : _scale) + for (auto &kernel : _scale) { kernel->run(); } CLScheduler::get().sync(); - for(auto &kernel : _copy) + for (auto &kernel : _copy) { kernel->run(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 4421a18f2a..e988ab0ac4 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -25,16 +25,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/operators/ClTransposedConvolution.h" -#include "src/common/utils/Log.h" - #include #include #include @@ -44,11 +44,11 @@ using namespace arm_compute::misc::shape_calculator; struct CLDeconvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor 
*biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; CLDeconvolutionLayer::~CLDeconvolutionLayer() = default; @@ -58,24 +58,35 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr memor { } -void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); } -void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info); - switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), + deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { auto op = std::make_unique(); - op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info); + op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, + output->info(), deconv_info); _impl->src = input; _impl->weights = weights; @@ -105,22 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC } } -Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate transposed convolution operator - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); break; } case DeconvolutionMethod::UPSCALE_CONV2D: { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); break; } case DeconvolutionMethod::GEMM: @@ -137,12 +154,16 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { return DeconvolutionMethod::UPSCALE_CONV2D; } @@ -154,11 +175,12 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); const size_t ofm = weights->tensor_shape()[idx_n]; - if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second) + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second) { // We observe better performance for FP32 types only when ofm <= 16. // A better heuristic is required for selecting the method for FP16 data types. 
- if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16))) + if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16))) { return DeconvolutionMethod::DIRECT; } @@ -175,7 +197,7 @@ void CLDeconvolutionLayer::run() { prepare(); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { // Optimized Operator will be used ITensorPack pack; @@ -195,7 +217,7 @@ void CLDeconvolutionLayer::run() void CLDeconvolutionLayer::prepare() { - if(_impl->op == nullptr) + if (_impl->op == nullptr) { _function->prepare(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index 0b428f5b17..b92bf903a6 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(std::make_unique()), - _fill(), - _output(nullptr) + : _upsample(std::make_unique()), _fill(), _output(nullptr) { } CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; -Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) +Status +CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); } @@ -52,13 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, info); _output = output; - _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _fill.configure(compile_context, _output, + PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); _upsample->configure(compile_context, input, _output, info); } diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index cac3f51013..6d2fea974e 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include @@ -37,16 +37,15 @@ namespace arm_compute { struct CLDepthConvertLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + 
const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDepthConvertLayer::CLDepthConvertLayer() - : _impl(std::make_unique()) +CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique()) { } -CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default; CLDepthConvertLayer::~CLDepthConvertLayer() = default; @@ -55,7 +54,11 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); } -void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift) { ARM_COMPUTE_UNUSED(shift); ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift); @@ -70,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); } -Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return opencl::ClCast::validate(input, output, policy); @@ -78,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void CLDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 98531e7cac..9477c7f81d 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include @@ -36,7 +35,10 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index dcb982fa56..873601bb11 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include 
"arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::misc; @@ -63,25 +63,33 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptrinfo(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output != nullptr ? output->info() : input->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); + ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, + output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); _is_quantized = is_data_type_quantized(input->info()->data_type()); @@ -96,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = output; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -119,10 +127,12 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont CLTensor *output_multipliers_to_use = nullptr; CLTensor *output_shifts_to_use = nullptr; - if(_is_quantized) + if (_is_quantized) { - const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); - const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1; + const size_t idx_c = + get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const size_t num_filters = + (is_data_type_quantized_per_channel(weights->info()->data_type())) ? 
weights->info()->dimension(idx_c) : 1; _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); @@ -132,16 +142,18 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont } // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; _dwc_native_kernel->set_target(gpu_target); _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use); + dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, + output_shifts_to_use); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); @@ -151,22 +163,27 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont _permuted_output.allocator()->allocate(); } - if(_is_quantized) + if (_is_quantized) { _output_multipliers.allocator()->allocate(); _output_shifts.allocator()->allocate(); } } -Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); const bool in_place = input == output || output == nullptr; - if(in_place) + if (in_place) { output = input; } @@ -174,21 +191,23 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > + input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > + input->dimension(idx_h) + conv_info.pad_top() + 
conv_info.pad_bottom()); const GPUTarget gpu_target = CLScheduler::get().target(); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; const bool needs_permute = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized(input->data_type()); TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) + if (is_quantized) { - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); @@ -201,40 +220,57 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe } } - if(needs_permute) + if (needs_permute) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout"); TensorShape permuted_input_shape = input->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); + const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation}; + TensorShape permuted_output_shape = + shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_input = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_weights = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_output = output->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NHWC); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, - dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info, + &output_multipliers_shifts_info, &output_multipliers_shifts_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); } else { // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, - &output_multipliers_shifts_info)); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input, weights, conv_info, dilation, depth_multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, + &output_multipliers_shifts_info)); } return Status{}; } @@ -245,12 +281,12 @@ void CLDepthwiseConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_needs_permute) + if (_needs_permute) { _permute_input_to_nhwc.run(); } CLScheduler::get().enqueue(*_dwc_native_kernel); - if(_needs_permute) + if (_needs_permute) { _permute_output_to_nchw.run(); } @@ -258,22 +294,21 @@ void CLDepthwiseConvolutionLayer::run() void CLDepthwiseConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_is_quantized) + if (_is_quantized) { _output_multipliers.map(); _output_shifts.map(); - quantization::compute_quantized_multipliers_and_shifts(_input->info(), - _original_weights->info(), - _output != nullptr ? _output->info() : _input->info(), - reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), - reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); + quantization::compute_quantized_multipliers_and_shifts( + _input->info(), _original_weights->info(), _output != nullptr ? 
_output->info() : _input->info(), + reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), + reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); _output_shifts.unmap(); } - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 64c6b5d91c..20162a03db 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -26,22 +26,21 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClDequantize.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClDequantize.h" namespace arm_compute { struct CLDequantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDequantizationLayer::CLDequantizationLayer() - : _impl(std::make_unique()) +CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique()) { } CLDequantizationLayer::~CLDequantizationLayer() = default; @@ -51,7 +50,9 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output) configure(CLKernelLibrary::get().get_compile_context(), input, output); } -void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +void CLDequantizationLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, output); _impl->src = input; diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 752e0e4a60..d6dae0d732 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -28,37 +28,46 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClActivation.h" +#include "src/gpu/cl/operators/ClDirectConv2d.h" namespace arm_compute { struct CLDirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDirectConvolutionLayer::CLDirectConvolutionLayer() - : _impl(std::make_unique()) +CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique()) { } -CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; +CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default; -void CLDirectConvolutionLayer::configure(ICLTensor *input, const 
ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -69,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -87,4 +101,4 @@ void CLDirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 88c3c6193c..3717f30ae1 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - #include #include @@ -55,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptrdata_layout(); @@ -70,20 +75,22 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), 
weights->dimension(idx_h), info); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - if(input->data_type() != weights->data_type()) + if (input->data_type() != weights->data_type()) { - ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || + !is_data_type_quantized_asymmetric(input->data_type())); } - if(bias != nullptr) + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -102,24 +109,39 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen unsigned int deconv_pad_y = 0; const unsigned int stride_x = info.stride().first; const unsigned int stride_y = info.stride().second; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } -void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info); } -void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info); @@ -141,15 +163,19 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte 
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); + auto out_dims = + deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); _is_prepared = weights_info.retain_internal_weights(); @@ -158,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape unsigned int deconv_pad_x = 0; unsigned int deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0; unsigned int deconv_pad_right = pad_left > pad_right ? 
pad_left - pad_right : 0; @@ -179,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _scaled_output.allocator()->init(scale_out_info); // configure scale function - const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, + deconv_pad_bottom, DimensionRoundingType::FLOOR); _scale_f.configure(compile_context, input, &_scaled_output, upsample_info); // Setup the function to convolve the upscaled output @@ -191,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _flip_axis.allocator()->allocate(); _flip_axis.map(true); auto axis_data = reinterpret_cast(_flip_axis.buffer()); - if(weights->info()->data_layout() == DataLayout::NHWC) + if (weights->info()->data_layout() == DataLayout::NHWC) { axis_data[0] = 1; axis_data[1] = 2; @@ -216,7 +244,7 @@ void CLDirectDeconvolutionLayer::run() void CLDirectDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); @@ -229,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare() _conv_f.prepare(); // Free flipped weights - if(!_weights_flipped.is_used()) + if (!_weights_flipped.is_used()) { _weights_flipped.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 936b37fb31..d9529f0b7f 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -26,8 +26,8 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClAdd.h" #include "src/gpu/cl/operators/ClElementwiseOperations.h" #include "src/gpu/cl/operators/ClSub.h" @@ -36,26 +36,30 @@ namespace arm_compute { struct CLArithmeticAddition::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLArithmeticAddition::CLArithmeticAddition() - : _impl(std::make_unique()) +CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique()) { } -CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; +CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default; CLArithmeticAddition::~CLArithmeticAddition() = default; -void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticAddition::configure( + ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticAddition::configure(const CLCompileContext &compile_context, + const 
ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClAdd::validate(input1, input2, output, policy, act_info); } @@ -82,26 +90,33 @@ void CLArithmeticAddition::run() struct CLArithmeticSubtraction::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLArithmeticSubtraction::CLArithmeticSubtraction() - : _impl(std::make_unique()) +CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique()) { } -CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; +CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction::~CLArithmeticSubtraction() = default; -void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticSubtraction::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClSub::validate(input1, input2, output, policy, act_info); } @@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run() struct CLArithmeticDivision::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; 
-CLArithmeticDivision::CLArithmeticDivision() - : _impl(std::make_unique()) +CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique()) { } -CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; +CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default; CLArithmeticDivision::~CLArithmeticDivision() = default; -void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLArithmeticDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info); } @@ -173,26 +201,32 @@ void CLArithmeticDivision::run() struct CLElementwiseMax::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseMax::CLElementwiseMax() - : _impl(std::make_unique()) +CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique()) { } -CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; +CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default; CLElementwiseMax::~CLElementwiseMax() = default; -void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), 
act_info); } -Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMax::validate(input1, input2, output, act_info); } @@ -218,26 +255,32 @@ void CLElementwiseMax::run() struct CLElementwiseMin::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseMin::CLElementwiseMin() - : _impl(std::make_unique()) +CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique()) { } -CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; +CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default; CLElementwiseMin::~CLElementwiseMin() = default; -void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMin::validate(input1, input2, output, act_info); } @@ -263,26 +309,32 @@ void CLElementwiseMin::run() struct CLElementwiseSquaredDiff::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() - : _impl(std::make_unique()) +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique()) { } -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default; -void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void 
CLElementwiseSquaredDiff::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info); } @@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run() struct CLElementwisePower::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwisePower::CLElementwisePower() - : _impl(std::make_unique()) +CLElementwisePower::CLElementwisePower() : _impl(std::make_unique()) { } -CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; +CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default; CLElementwisePower::~CLElementwisePower() = default; -void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwisePower::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp index 
9dcd2d1891..3043c26feb 100644 --- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClElementwiseUnary.h" @@ -32,17 +33,16 @@ namespace arm_compute { struct CLRsqrtLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLRsqrtLayer::CLRsqrtLayer() - : _impl(std::make_unique()) +CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique()) { } -CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; +CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default; CLRsqrtLayer::~CLRsqrtLayer() = default; @@ -74,17 +74,16 @@ void CLRsqrtLayer::run() struct CLExpLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLExpLayer::CLExpLayer() - : _impl(std::make_unique()) +CLExpLayer::CLExpLayer() : _impl(std::make_unique()) { } -CLExpLayer::CLExpLayer(CLExpLayer &&) = default; +CLExpLayer::CLExpLayer(CLExpLayer &&) = default; CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default; CLExpLayer::~CLExpLayer() = default; @@ -116,17 +115,16 @@ void CLExpLayer::run() struct CLNegLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLNegLayer::CLNegLayer() - : _impl(std::make_unique()) +CLNegLayer::CLNegLayer() : _impl(std::make_unique()) { } -CLNegLayer::CLNegLayer(CLNegLayer &&) = default; +CLNegLayer::CLNegLayer(CLNegLayer &&) = default; CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default; CLNegLayer::~CLNegLayer() = default; @@ -157,17 +155,16 @@ void CLNegLayer::run() struct CLSinLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLSinLayer::CLSinLayer() - : _impl(std::make_unique()) +CLSinLayer::CLSinLayer() : _impl(std::make_unique()) { } -CLSinLayer::CLSinLayer(CLSinLayer &&) = default; +CLSinLayer::CLSinLayer(CLSinLayer &&) = default; CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default; CLSinLayer::~CLSinLayer() = default; @@ -198,17 +195,16 @@ void CLSinLayer::run() struct CLAbsLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLAbsLayer::CLAbsLayer() - : _impl(std::make_unique()) +CLAbsLayer::CLAbsLayer() : _impl(std::make_unique()) { } -CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; +CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default; CLAbsLayer::~CLAbsLayer() = default; @@ -239,17 +235,16 @@ void CLAbsLayer::run() struct CLLogLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogLayer::CLLogLayer() - : 
_impl(std::make_unique()) +CLLogLayer::CLLogLayer() : _impl(std::make_unique()) { } -CLLogLayer::CLLogLayer(CLLogLayer &&) = default; +CLLogLayer::CLLogLayer(CLLogLayer &&) = default; CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default; CLLogLayer::~CLLogLayer() = default; @@ -280,17 +275,16 @@ void CLLogLayer::run() struct CLRoundLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLRoundLayer::CLRoundLayer() - : _impl(std::make_unique()) +CLRoundLayer::CLRoundLayer() : _impl(std::make_unique()) { } -CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; +CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default; CLRoundLayer::~CLRoundLayer() = default; diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index bd0966b65f..48e9ae824a 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -26,13 +26,13 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT1D::CLFFT1D(std::shared_ptr memory_manager) @@ -54,7 +54,10 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) +void CLFFT1D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT1DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config)); @@ -77,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, + digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); _fft_kernels.reserve(_num_ffts); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -93,18 +97,20 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels.emplace_back(std::make_unique()); - _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, + ((i == (_num_ffts - 1)) && !is_c2r) ? 
output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -123,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = CLFFTRadixStageKernel::supported_radix(); @@ -132,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); @@ -151,13 +157,13 @@ void CLFFT1D::run() CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling - if(_run_scale) + if (_run_scale) { CLScheduler::get().enqueue(*_scale_kernel, true); } diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index 94fc411355..3857046719 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -26,16 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT2D::CLFFT2D(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -46,7 +49,10 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) +void CLFFT2D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT2DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); 
ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config)); @@ -88,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index d12e2de3bf..3894b10785 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -25,10 +25,12 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" @@ -38,8 +40,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -50,11 +50,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -104,17 +104,31 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr mem { } -void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); - ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), + biases != nullptr ? biases->info() : nullptr, + output->info(), conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); _original_weights = weights; @@ -124,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -146,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -167,7 +184,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -175,10 +192,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -202,7 +219,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -214,25 +232,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(compile_context, 
&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, + Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -243,7 +264,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -255,7 +276,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(compile_context, output, nullptr, act_info); } @@ -269,8 +290,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_axis.unmap(); } -Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math); @@ -287,24 +313,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x()); } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + 
(input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } @@ -320,7 +349,7 @@ void CLFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -336,17 +365,17 @@ void CLFFTConvolutionLayer::run() _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer()); _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -354,10 +383,10 @@ void CLFFTConvolutionLayer::run() void CLFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -366,7 +395,7 @@ void CLFFTConvolutionLayer::prepare() const ICLTensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index 6019a84aba..9bd96a975e 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClFill.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct CLFill::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLFill::CLFill() - : _impl(std::make_unique()) +CLFill::CLFill() : _impl(std::make_unique()) { } -CLFill::CLFill(CLFill &&) = default; +CLFill::CLFill(CLFill &&) = default; CLFill &CLFill::operator=(CLFill &&) = default; CLFill::~CLFill() = default; @@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window); } -void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window) +void CLFill::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 32fc37552c..ba1b5372d3 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -26,8 +26,9 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct 
CLFlattenLayer::Impl
 {
-    const ICLTensor *src{ nullptr };
-    ICLTensor *dst{ nullptr };
-    std::unique_ptr<opencl::ClFlatten> op{ nullptr };
+    const ICLTensor *src{nullptr};
+    ICLTensor *dst{nullptr};
+    std::unique_ptr<opencl::ClFlatten> op{nullptr};
 };
 
-CLFlattenLayer::CLFlattenLayer()
-    : _impl(std::make_unique<Impl>())
+CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
+CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&)            = default;
 CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default;
 CLFlattenLayer::~CLFlattenLayer() = default;
 
@@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     _impl->src = input;
     _impl->dst = output;
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+                                            misc::shape_calculator::compute_flatten_shape(input->info())));
 
     _impl->op = std::make_unique<opencl::ClFlatten>();
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info());
@@ -68,9 +69,10 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
 Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
     }
     return opencl::ClFlatten::validate(input, output);
@@ -83,4 +85,4 @@ void CLFlattenLayer::run()
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 8739e1803e..4322219dd9 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClFloor.h"
 
@@ -34,16 +35,15 @@ namespace arm_compute
 {
 struct CLFloor::Impl
 {
-    const ICLTensor *src{ nullptr };
-    ICLTensor *dst{ nullptr };
-    std::unique_ptr<opencl::ClFloor> op{ nullptr };
+    const ICLTensor *src{nullptr};
+    ICLTensor *dst{nullptr};
+    std::unique_ptr<opencl::ClFloor> op{nullptr};
 };
 
-CLFloor::CLFloor()
-    : _impl(std::make_unique<Impl>())
+CLFloor::CLFloor() : _impl(std::make_unique<Impl>())
 {
 }
-CLFloor::CLFloor(CLFloor &&) = default;
+CLFloor::CLFloor(CLFloor &&)            = default;
 CLFloor &CLFloor::operator=(CLFloor &&) = default;
 CLFloor::~CLFloor() = default;
 
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 1c162db79a..b30f9e701f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClFullyConnected.h"
 
@@ -35,21 +36,22 @@ using namespace arm_compute::experimental;
struct CLFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; -CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); @@ -58,39 +60,45 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr mem CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; -void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); } -void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); _impl->op = std::make_unique(); _impl->original_weights = weights; _impl->is_prepared = fc_info.retain_internal_weights; - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), fc_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } else { @@ -98,14 +106,14 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c _impl->run_pack.add_tensor(ACL_DST, output); } - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, FullyConnectedLayerInfo fc_info) { return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); @@ -113,7 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn void CLFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -124,7 +132,7 @@ void CLFullyConnectedLayer::run() void CLFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -133,13 +141,13 @@ void CLFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) { _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 7379e9d9fe..e4fbf78e13 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" 
namespace arm_compute
 {
@@ -41,29 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization()
 
 CLFuseBatchNormalization::~CLFuseBatchNormalization() = default;
 
-void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                         float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const ICLTensor *input_weights,
+                                         const ICLTensor *bn_mean,
+                                         const ICLTensor *bn_var,
+                                         ICLTensor *fused_weights,
+                                         ICLTensor *fused_bias,
+                                         const ICLTensor *input_bias,
+                                         const ICLTensor *bn_beta,
+                                         const ICLTensor *bn_gamma,
+                                         float epsilon,
+                                         FuseBatchNormalizationType fbn_type)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+              input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                         float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context,
+                                         const ICLTensor *input_weights,
+                                         const ICLTensor *bn_mean,
+                                         const ICLTensor *bn_var,
+                                         ICLTensor *fused_weights,
+                                         ICLTensor *fused_bias,
+                                         const ICLTensor *input_bias,
+                                         const ICLTensor *bn_beta,
+                                         const ICLTensor *bn_gamma,
+                                         float epsilon,
+                                         FuseBatchNormalizationType fbn_type)
 {
-    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
-    _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+                           epsilon, fbn_type);
+    _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
+                               bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                          float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights,
+                                          const ITensorInfo *bn_mean,
+                                          const ITensorInfo *bn_var,
+                                          const ITensorInfo *fused_weights,
+                                          const ITensorInfo *fused_bias,
+                                          const ITensorInfo *input_bias,
+                                          const ITensorInfo *bn_beta,
+                                          const ITensorInfo *bn_gamma,
+                                          float epsilon,
+                                          FuseBatchNormalizationType fbn_type)
 {
-    return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+                                                    input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 void CLFuseBatchNormalization::run()
diff --git 
a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 427ea51ab9..871a1d6e27 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemm.h" @@ -40,15 +41,15 @@ using OperatorType = opencl::ClGemm; struct CLGEMM::Impl { - const ICLTensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager) @@ -60,12 +61,25 @@ CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager * CLGEMM::~CLGEMM() = default; -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); } -void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -73,25 +87,33 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor _impl->op = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), + alpha, beta, gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, _impl->b } }; + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info); } @@ -107,15 +129,15 @@ void CLGEMM::run() void CLGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index c8c18f35db..aef7cddd7a 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -27,10 +27,11 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "support/Cast.h" @@ -47,18 +48,19 @@ using namespace arm_compute::experimental; struct CLGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(memory_manager); @@ -67,40 +69,60 @@ 
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr m CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; -void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, num_groups); } -void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); _impl->weights = weights; _impl->op = std::make_unique(); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv2d_info, weights_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = { + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); @@ -115,14 +137,14 @@ void CLGEMMConvolutionLayer::run() void CLGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 9fc81c11da..7d40cf1829 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include @@ -40,12 +40,13 @@ namespace arm_compute { namespace { -std::pair compute_start_end_slice_coordinates(const 
ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) +std::pair +compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) { Coordinates start; Coordinates end; - if(is_nchw) + if (is_nchw) { start.set(0, deconv_info.pad_left()); start.set(1, deconv_info.pad_top()); @@ -63,13 +64,16 @@ std::pair compute_start_end_slice_coordinates(const IT end.set(2, output_info.dimension(2) - deconv_info.pad_bottom()); } - return { start, end }; + return {start, end}; } -Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) +Status construct_gemmlowp_output_stage(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + GEMMLowpOutputStageInfo &output_stage_info) { const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ -78,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -122,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptrdata_layout(); - const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; + const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || + deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; const bool is_nchw = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -144,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso TensorShape nhwc_weights_shape = weights->tensor_shape(); TensorShape nhwc_input_shape = input->tensor_shape(); - if(is_nchw) + if (is_nchw) { permute(nhwc_weights_shape, PermutationVector(2, 0, 1)); permute(nhwc_input_shape, PermutationVector(2, 0, 1)); - TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_input_info = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_input_shape) + .set_data_layout(DataLayout::NCHW); - TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_weights_info = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_weights_shape) + .set_data_layout(DataLayout::NCHW); CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1)); CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1)); } - const TensorShape reshaped_shape = 
TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); - const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); + const TensorShape reshaped_shape = + TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); + const TensorInfo reshaped_info = + weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info)); TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]); @@ -166,77 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info)); TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b), - input->dimension(idx_w), - input->dimension(idx_h), - input->dimension(idx_b)); + input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b)); TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true); GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true); GEMMLowpOutputStageInfo output_stage_info; - if(is_quantized) + if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, + &gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), + &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); - if(padded_input && is_quantized) + if (padded_input && is_quantized) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, 
&col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate( + &col2im_output_info, nullptr, + &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), + output, start_end.first, start_end.second)); } - else if(padded_input) + else if (padded_input) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second)); } - else if(is_quantized) + else if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); } return Status{}; } -void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); } -void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), - weights->info(), - bias != nullptr ? 
bias->info() : nullptr, - output->info(), - deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate( + input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info); _original_weights = weights; - _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || + deconv_info.pad_top() > 0; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; @@ -245,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context // do an outer product in NCHW and then an accumulation through a reduction. This would have two // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction // might be slower than GEMM. - if(_is_nchw) + if (_is_nchw) { _memory_group.manage(&_permuted_input); _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); @@ -257,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Reshape the input weights. The weights will be reshaped only once during the call to prepare() - _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0), - weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)), - 1, - input->info()->data_type(), weights->info()->quantization_info())); + _reshaped_weights.allocator()->init( + TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) * + weights_to_use->info()->dimension(2) * + weights_to_use->info()->dimension(3)), + 1, input->info()->data_type(), weights->info()->quantization_info())); _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); @@ -269,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); // Configure output stage for asymmetric quantized types - if(_is_quantized) + if (_is_quantized) { // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original // and restore them back to make it work properly. 
QuantizationInfo iq_info = input->info()->quantization_info();
         QuantizationInfo wq_info = weights->info()->quantization_info();
 
-        input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
-        _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
+        input_to_use->info()->set_quantization_info(
+            QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
+        _reshaped_weights_t.info()->set_quantization_info(
+            QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
 
         _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
 
@@ -286,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
     }
     else
     {
-        _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+        _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f,
+                           gemm_info);
     }
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -298,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
     ICLTensor *slice_output = nullptr;
     ICLTensor *output_stage_output = nullptr;
 
-    if(_padded_input && _is_quantized)
+    if (_padded_input && _is_quantized)
     {
         _memory_group.manage(&_slice_gemm_input);
         _memory_group.manage(&_gemmlowp_final);
@@ -306,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
         output_stage_output = &_slice_gemm_input;
         slice_output = output;
     }
-    else if(_padded_input)
+    else if (_padded_input)
     {
         _memory_group.manage(&_slice_gemm_input);
         deconv_reshape_output = &_slice_gemm_input;
         slice_output = output;
     }
-    else if(_is_quantized)
+    else if (_is_quantized)
     {
         _memory_group.manage(&_gemmlowp_final);
         deconv_reshape_output = &_gemmlowp_final;
@@ -324,21 +367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
     }
 
     // Configure a Col2Im call to reshape the output of GEMM
-    _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+    _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(),
+                               weights->info(), deconv_info);
     _gemm_output.allocator()->allocate();
 
-    if(_is_quantized)
+    if (_is_quantized)
    {
         GEMMLowpOutputStageInfo output_stage_info;
         construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
-        _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
+        _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output,
+                                         output_stage_info);
         _gemmlowp_final.allocator()->allocate();
     }
 
    // If the input was padded, the output needs to be sliced.
- if(_padded_input) + if (_padded_input) { - const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); + const auto start_end = + compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } @@ -350,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_nchw) + if (_is_nchw) { _permute_input_to_nhwc.run(); } - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp.run(); } @@ -366,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run() CLScheduler::get().enqueue(*_deconv_reshape, false); - if(_is_quantized) + if (_is_quantized) { _gemmlowp_output_stage.run(); } - if(_padded_input) + if (_padded_input) { _slice_gemm.run(); } @@ -379,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run() void CLGEMMDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->allocate(); _permute_weights_to_nhwc.run(); @@ -392,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare() _reshaped_weights.allocator()->allocate(); _reshape_weights.run(); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->free(); } @@ -401,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare() _transpose_weights.run(); // Prepare gemm - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm.prepare(); } @@ -411,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare() } // Free resources - if(!_reshaped_weights_t.is_used()) + if (!_reshaped_weights_t.is_used()) { _reshaped_weights_t.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index d9029478a1..8bad198658 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -31,12 +31,12 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/IMemoryManager.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" namespace arm_compute @@ -46,13 +46,13 @@ using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore; struct CLGEMMLowpMatrixMultiplyCore::Impl { - const ICLTensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) @@ -63,12 +63,18 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrop = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, gemm_info); } @@ -108,7 +120,7 @@ void CLGEMMLowpMatrixMultiplyCore::run() void CLGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 6feed0d713..3dd8c5f101 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -40,27 +40,33 @@ namespace arm_compute { struct CLGEMMLowpOutputStage::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *bias{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *bias{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() - : _impl(std::make_unique()) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique()) { } -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; -void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); } -void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -69,11 +75,15 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), info); - _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } }; + _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), + info); + _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}}; } -Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index 033c117cec..2610cb1a3b 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/kernels/CLGatherKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLGatherKernel.h" namespace arm_compute { @@ -35,7 +35,11 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGather::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 9cb7d618cf..b2c1d2631e 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr memory_manager) @@ -71,48 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = 
scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -126,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -140,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context CLTensor *anchors_to_use = &_all_anchors; CLTensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { 
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -163,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -183,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -195,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _num_valid_proposals = num_valid_proposals; _memory_group.manage(&_proposals_4_roi_values); - _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, - BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height())); + _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, + &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, + BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height())); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); _all_proposals_to_use->allocator()->allocate(); _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -216,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -227,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate( + 
anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo 
proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -309,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -356,7 +412,7 @@ void CLGenerateProposalsLayer::run() CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -364,7 +420,7 @@ void 
CLGenerateProposalsLayer::run() _flatten_deltas.run(); _flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors->run(); _dequantize_deltas->run(); @@ -373,7 +429,7 @@ void CLGenerateProposalsLayer::run() // Build the boxes CLScheduler::get().enqueue(*_bounding_box_kernel, false); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals->run(); } diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp index 90af36aa77..1a2369c5c2 100644 --- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp @@ -26,36 +26,45 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClIndirectConv2d.h" namespace arm_compute { struct CLIndirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() - : _impl(std::make_unique()) +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique()) { } -CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default; CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default; -void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -65,10 +74,15 @@ void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->biases = biases; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -83,4 +97,4 @@ void CLIndirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 5feafe19db..0e994e1aee 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -27,50 +27,62 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT - : _inst_norm_kernel(), - _mean_var_kernel(), - _mean_var_tensor(), - _ctx(ctx) + : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx) { } CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer() { } -void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure( + ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); } -void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision); auto w = std::make_unique(); w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision); _mean_var_kernel = std::move(w); auto k = std::make_unique(); - k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + k->configure(compile_context, input, &_mean_var_tensor, output, + InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); _inst_norm_kernel = std::move(k); _mean_var_tensor.allocator()->allocate(); } -Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - return 
CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + return CLInstanceNormalizationLayerKernel::validate( + input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); } void CLInstanceNormalizationLayer::run() { - ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured"); + ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, + "The child class didn't set the CL kernel or function isn't configured"); schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get()); schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get()); } diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 1278385f53..4fe1d9b20b 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -57,7 +57,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); } -void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayer::configure( + const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); @@ -86,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index ea08beca75..3b50234c77 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/gpu/cl/kernels/ClTransposeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" namespace arm_compute { @@ -40,54 +40,155 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), 
_fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), - _transpose_cell_state(std::make_unique()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), - _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), - _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), - _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), - _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), - _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), - _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), - _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), - _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), - _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(std::make_unique()), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _ones_fill(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + 
_input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), + _is_layer_norm_lstm(false) { } CLLSTMLayer::~CLLSTMLayer() = default; -void CLLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, + cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, cell_threshold, projection_threshold); } -void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor 
*cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, - output, lstm_params, activation_info, cell_threshold, projection_threshold); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -96,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + 
forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); // Configure block that calculates the forget gate @@ -126,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); + const TensorShape weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); CLTensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_forget_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -154,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, + 
lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, + &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -178,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, + ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -195,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); @@ -203,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_input_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -221,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(compile_context, input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, + lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -249,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, + (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info()); _recurrent_to_cell_weights = recurrent_to_cell_weights; _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, + 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, + ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, + lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, + ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 
0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -298,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); @@ -306,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, + (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); CLTensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), + &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -329,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(compile_context, output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, + lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be 
reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, + &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -361,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, + output_state_out_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); _cell_state_activation.allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -383,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -397,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe output_gate_out->allocator()->allocate(); } -Status CLLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo 
*cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status CLLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -438,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted 
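        // CIFG ("coupled input and forget gate") derives the input gate as 1 - forget_gate
        // (the _subtract_input_gate path in configure()), so no separate input-gate
        // parameters, including input layer normalization weights, are expected here.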
- if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -459,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -470,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -488,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type()); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? 
nullptr : forget_gate_bias, &forget_gate)); std::vector inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -525,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape 
lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + 
TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, + 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, 
lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -616,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -638,12 +829,12 @@ void CLLSTMLayer::run() _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -651,7 +842,7 @@ void CLLSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { _ones_fill.run(); _subtract_input_gate.run(); @@ -660,13 +851,13 @@ void CLLSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -679,12 +870,10 @@ void CLLSTMLayer::run() ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights); pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2); - CLScheduler::get().enqueue_op(*_transpose_cell_state, - pack, - false); + CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -695,19 +884,19 @@ void CLLSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -718,10 +907,10 @@ void CLLSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -735,10 +924,10 @@ void CLLSTMLayer::run() void CLLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index d14c6102d5..ea64eda023 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -25,12 +25,12 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" 
-#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/helpers/AutoConfiguration.h" +#include "arm_compute/core/Validate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include @@ -46,48 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), - _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), - _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), - _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), - _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), - _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), - _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add_cell_state_tmps(), + _add2(), + _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), + _mul_output_state_tmp_output_gate(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state_tmp1(), + _cell_state_tmp2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), + 
_is_prepared(false) { } void CLLSTMLayerQuantized::configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, - output_state_out); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); } -void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + 
ICLTensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -95,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; 
_input_to_forget_weights = input_to_forget_weights; @@ -124,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); @@ -144,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co input_vector.emplace_back(output_state_in); _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation @@ -159,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -169,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -191,85 +280,111 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - 
_slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0}, + {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, + {2 * output_size, 0}, {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size}, + {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size}, + {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + 
_input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); - _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); - _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, + &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, + 
ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, + &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -278,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co } Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, 
input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8); const int input_size = input->dimension(0); @@ -299,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -343,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector weights_vector; @@ -353,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); 
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -379,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -390,7 +542,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info{}; @@ -405,68 +558,91 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { 
output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const 
TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -475,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -541,7 +717,7 @@ void CLLSTMLayerQuantized::run() void CLLSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp index 696191c485..ea21c54bc3 100644 --- a/src/runtime/CL/functions/CLLogicalAnd.cpp +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); @@ -54,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors) struct CLLogicalAnd::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalAnd::CLLogicalAnd() - : _impl(std::make_unique()) +CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique()) { } -CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; CLLogicalAnd::~CLLogicalAnd() = default; @@ -73,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp index a0504d7852..71f9cce54f 100644 --- a/src/runtime/CL/functions/CLLogicalNot.cpp +++ b/src/runtime/CL/functions/CLLogicalNot.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClLogicalNot.h" @@ -32,16 +33,15 @@ namespace arm_compute { struct CLLogicalNot::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalNot::CLLogicalNot() - : _impl(std::make_unique()) +CLLogicalNot::CLLogicalNot() : _impl(std::make_unique()) { } -CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; +CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default; CLLogicalNot::~CLLogicalNot() = default; @@ -72,4 +72,4 @@ void CLLogicalNot::run() _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp index f9a606e8a5..3db4fdae84 100644 --- a/src/runtime/CL/functions/CLLogicalOr.cpp +++ b/src/runtime/CL/functions/CLLogicalOr.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLLogicalOr.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); @@ -54,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors) struct CLLogicalOr::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalOr::CLLogicalOr() - : _impl(std::make_unique()) +CLLogicalOr::CLLogicalOr() : _impl(std::make_unique()) { } -CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; +CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default; CLLogicalOr::~CLLogicalOr() = default; @@ -73,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp index bef422fca1..e8bdad706b 100644 --- a/src/runtime/CL/functions/CLMatMul.cpp +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLMatMul.h" + #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTypes.h" + #include "src/gpu/cl/operators/ClMatMul.h" namespace arm_compute @@ -32,23 +34,32 @@ using OperatorType = opencl::ClMatMul; struct CLMatMul::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -CLMatMul::CLMatMul() - : _impl(std::make_unique()) +CLMatMul::CLMatMul() : _impl(std::make_unique()) { } CLMatMul::~CLMatMul() = default; -void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CLMatMul::configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(settings); configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); } -void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, +void CLMatMul::configure(const CLCompileContext &compile_context, + ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); @@ -56,10 +67,14 @@ void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs _impl->op = std::make_unique(); _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; } -Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status CLMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); } diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp index 2786d32d33..7494f379b9 100644 --- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -27,26 +27,32 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" namespace arm_compute { CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() - : _fill(), - _unpooling_layer_kernel(std::make_unique()) + : _fill(), _unpooling_layer_kernel(std::make_unique()) { } CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; -void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); } -void CLMaxUnpoolingLayer::configure(const CLCompileContext 
&compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); const PixelValue zero_value(0.f); @@ -55,7 +61,10 @@ void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICL _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); } -Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); } diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index a81cbca1b0..5892c0e840 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index c0cc5184e6..f93f82f1a2 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" namespace arm_compute { @@ -50,7 +50,10 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); @@ -58,21 +61,24 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC // Configure 
normalization kernel _norm_kernel->configure(compile_context, input, output, norm_info); - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, + PixelValue()); } } -Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status CLNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { return CLNormalizationLayerKernel::validate(input, output, norm_info); } void CLNormalizationLayer::run() { - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Run border handler CLScheduler::get().enqueue(*_border_handler, false); diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 63c9164a94..939c95bd45 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -24,20 +24,26 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include namespace arm_compute { -void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, std); auto k = std::make_unique(); @@ -45,8 +51,10 @@ void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_contex _kernel = std::move(k); } -Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index 186e7b4ba2..ce6d285ebe 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLPReluLayer.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/operators/ClPRelu.h" @@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu; struct CLPReluLayer::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPReluLayer::CLPReluLayer() - : _impl(std::make_unique()) +CLPReluLayer::CLPReluLayer() : _impl(std::make_unique()) { } -CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; +CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default; CLPReluLayer::~CLPReluLayer() = default; @@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); } -void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +void CLPReluLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *alpha, + ICLTensor *output) { _impl->src_0 = input; _impl->src_1 = alpha; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info())); + _impl->op->configure(compile_context, input->info(), alpha->info(), + (output == nullptr ? input->info() : output->info())); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 0ed8f03d64..e788ded512 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,37 +22,38 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" -#include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" namespace arm_compute { -CLPadLayer::CLPadLayer() - : _pad_kernel(std::make_unique()), - _copy(), - _perform_pad(false) +CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique()), _copy(), _perform_pad(false) { } CLPadLayer::~CLPadLayer() = default; -void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure( + ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); - _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + _perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(_perform_pad) + if (_perform_pad) { _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } @@ -62,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i _copy.configure(compile_context, input, output); } } -Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { - bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + bool perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(perform_pad) + if (perform_pad) { ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode)); } @@ -81,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, } void CLPadLayer::run() { - if(_perform_pad) + if (_perform_pad) { CLScheduler::get().enqueue(*_pad_kernel); } diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index a56afff7df..7f97eed98a 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClPermute.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" namespace arm_compute { struct CLPermute::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - 
std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPermute::CLPermute() - : _impl(std::make_unique()) +CLPermute::CLPermute() : _impl(std::make_unique()) { } @@ -53,7 +52,10 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); } -void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +void CLPermute::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, perm); diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 9d91e58367..6aa9d9cbb3 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClMul.h" @@ -34,38 +35,55 @@ namespace arm_compute { struct CLPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _impl(std::make_unique()) +CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique()) { } -CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default; -void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, + rounding_policy, act_info); } -void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input1->info(), 
input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, + rounding_policy, act_info); } -Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } @@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run() struct CLComplexPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _impl(std::make_unique()) +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique()) { } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; - -void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +CLComplexPixelWiseMultiplication & +CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; + +void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClComplexMul::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp index 11ae1d0fe6..ce1092a7cc 100644 --- 
a/src/runtime/CL/functions/CLPooling3dLayer.cpp +++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool3d.h" @@ -32,14 +33,13 @@ namespace arm_compute { struct CLPooling3dLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPooling3dLayer::CLPooling3dLayer() - : _impl(std::make_unique()) +CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique()) { } CLPooling3dLayer::~CLPooling3dLayer() = default; @@ -49,7 +49,10 @@ void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, cons configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info); } -void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info) +void CLPooling3dLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info) { _impl->src = input; _impl->dst = output; @@ -58,7 +61,8 @@ void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const _impl->op->configure(compile_context, input->info(), output->info(), pool_info); } -Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return opencl::ClPool3d::validate(input, output, pool_info); } diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index 0ebce318fa..65e53b9be3 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool2d.h" @@ -32,34 +33,44 @@ namespace arm_compute { struct CLPoolingLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPoolingLayer::CLPoolingLayer() - : _impl(std::make_unique()) +CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique()) { } CLPoolingLayer::~CLPoolingLayer() = default; -void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); } -void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { _impl->src = input; _impl->dst = output; _impl->indices = indices; _impl->op = 
std::make_unique(); - _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info, + (indices) ? indices->info() : nullptr); } -Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CLPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return opencl::ClPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index 019f0a7e61..cfd0ec4fbf 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -29,31 +29,40 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" using namespace arm_compute; -CLPriorBoxLayer::CLPriorBoxLayer() - : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) +CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) { } -void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); } -void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); - _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); - _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); - if(!info.max_sizes().empty()) + _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.min_sizes().size() * sizeof(float)); + _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.aspect_ratios().size() * sizeof(float)); + if (!info.max_sizes().empty()) { - _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float)); + _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.max_sizes().size() * sizeof(float)); } auto k = std::make_unique(); @@ -61,7 +70,10 @@ void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I _kernel = std::move(k); } -Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayer::validate(const ITensorInfo *input1, + 
const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return CLPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 7fbb866fa9..12f6f89290 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -26,29 +26,36 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::utils::info_helpers; using namespace arm_compute::opencl::kernels; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -78,14 +85,12 @@ void CLQLSTMLayer::TensorCopyKernel::run() _src->map(q, true); _dst->map(q, true); - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); _src->unmap(q); _dst->unmap(q); @@ -104,7 +109,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr memory_manager) _layer_norms(), _copy_output() { - for(auto &norm : _layer_norms) + for (auto &norm : _layer_norms) { norm = std::make_unique(); } @@ -129,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. 
- const TensorInfo out - { - in - }; + const TensorInfo out{in}; return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } -void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, - CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -151,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void CLQLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, - cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out, output, lstm_params); } -void CLQLSTMLayer::configure(const CLCompileContext 
&compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -191,11 +222,11 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info)); const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -216,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -238,53 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. 
- if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), + _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), + _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), + _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - 
_projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + compile_context, _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, + &_projection_eff_bias, ConvertPolicy::SATURATE); } } // Pre-transpose weights to be used in GEMM. - _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, + &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, + &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, + &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, + &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } @@ -297,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / 
lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, + &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, + &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res); _recurrent_to_forget_outstage_res.allocator()->allocate(); @@ -345,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
-    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
-                 input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
-                 &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
-                 mm_out_info, cell_outstage_info);
-
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
-                 output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
-                 &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
-                 mm_out_info, cell_outstage_info);
-
-    _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+                                      qinput.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input,
+                 &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res,
+                 &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info);
+
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+                 &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+                 &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
+
+    _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res,
+                                                     &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
                                                      ConvertPolicy::SATURATE);
     _input_to_cell_outstage_res.allocator()->allocate();
     CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);
         _recurrent_to_cell_outstage_res.allocator()->allocate();
@@ -378,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_cell_gate);
     _cell_gate.allocator()->init(cell_gate_info);
-    _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate,
+                              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     cell_activation_input->allocator()->allocate();
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _input_gate.allocator()->init(input_gate_info);
     _memory_group.manage(&_input_gate);
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _ones.allocator()->init(*_forget_gate.info());
         _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -393,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
     }
     else
     {
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
-                     input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
-                     &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
-                     mm_out_info, input_outstage_info);
-
-        const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+                     &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+                     &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
+
+        const float recurrent_to_input_scale =
+            _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
         configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
                      output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
                      &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
                      mm_out_info, input_outstage_info);
-        _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
-                                                    ConvertPolicy::SATURATE);
+        _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res,
+                                                    &_recurrent_to_input_outstage_res,
+                                                    &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
         _input_to_input_outstage_res.allocator()->allocate();
-        if(_has_peephole)
+        if (_has_peephole)
         {
-            _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+            _mul_cell_to_input_res.allocator()->init(
+                TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
             _memory_group.manage(&_mul_cell_to_input_res);
-            _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-            _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+            _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+                                                   &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+                                                   RoundingPolicy::TO_ZERO);
+            const float cell_to_input_scale =
+                std::pow(2, cell_shift) *
+                lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+                lstm_params.input_intermediate_scale();
+            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                         &gemmlowp_info.gemmlowp_shift);
+            _cell_to_input_outstage_res.allocator()->init(
+                TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                           QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
             _memory_group.manage(&_cell_to_input_outstage_res);
-            _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+            _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr,
+                                              &_cell_to_input_outstage_res, gemmlowp_info);
             _mul_cell_to_input_res.allocator()->allocate();
-            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+                                             &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
             _cell_to_input_outstage_res.allocator()->allocate();
         }
         CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;
-        if(_has_layer_norm)
+        if (_has_layer_norm)
        {
            configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);
            _recurrent_to_input_outstage_res.allocator()->allocate();
            input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
        }
-        _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate,
+                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
        input_activation_input->allocator()->allocate();
     }
     // Cell.
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
-    _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
     const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
-    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                         QuantizationInfo(mul_input_cell_scale, 0));
     _memory_group.manage(&_mul_input_cell_res);
     _mul_input_cell_res.allocator()->init(mul_input_cell_info);
-    _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f,
+                                        ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_gate.allocator()->allocate();
-    _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+    _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out,
+                               ConvertPolicy::SATURATE);
     _mul_input_cell_res.allocator()->allocate();
     _forget_gate.allocator()->allocate();
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
-        _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+        _cell_clip.configure(compile_context, cell_state_out, nullptr,
+                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                 -quantized_cell_clip, quantized_cell_clip));
     }
     // Output gate.
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
-                 input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
-                 &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
-                 mm_out_info, output_outstage_info);
-
-    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+                                        qinput.scale / lstm_params.output_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+                 &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+                 &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
+
+    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
     configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
                  &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
                  mm_out_info, output_outstage_info);
-    _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+    _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+                                                 &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                                  ConvertPolicy::SATURATE);
     _input_to_output_outstage_res.allocator()->allocate();
-    if(_has_peephole)
+    if (_has_peephole)
     {
         // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_output_res);
-        _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-
-        const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
-        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+        _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(),
+                                                &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
+
+        const float cell_to_output_scale =
+            std::pow(2, cell_shift) *
+            lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+            lstm_params.output_intermediate_scale();
+        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift);
+        _cell_to_output_outstage_res.allocator()->init(
+            TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                       QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_output_outstage_res);
-        _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+        _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr,
+                                           &_cell_to_output_outstage_res, gemmlowp_info);
         _mul_cell_to_output_res.allocator()->allocate();
-        _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+        _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+                                             &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                              ConvertPolicy::SATURATE);
         _cell_to_output_outstage_res.allocator()->allocate();
     }
     CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);
         _recurrent_to_output_outstage_res.allocator()->allocate();
@@ -503,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_output_gate);
     _output_gate.allocator()->init(output_gate_info);
-    _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     output_activation_input->allocator()->allocate();
     // Hidden.
-    _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate,
+                           ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
-    _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f,
+                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _output_gate.allocator()->allocate();
     _input_gate.allocator()->allocate();
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                 &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
     gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = output_state_in->info()->data_type();
@@ -525,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
     _memory_group.manage(&_hidden_gate);
-    if(_projection_tensor_copy_required)
+    if (_projection_tensor_copy_required)
     {
         _hidden_gate.allocator()->init(*output_state_out->info());
         _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -536,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
     _hidden_mul_res.allocator()->allocate();
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         const TensorInfo projection_outstage_info(*output_state_out->info());
-        const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
-        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
-        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
-        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
-        gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
-
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+        gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
-        configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
-                     hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
-                     &_mm_projection_res, &_projection_outstage_res, projection_scale,
-                     projection_mm_out_info, projection_outstage_info);
+        configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+                     &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+                     &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
         ICLTensor *accumulate_destination = output_state_out;
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_gate.allocator()->allocate();
             _projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -565,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
             accumulate_destination = &_projection_accumulate_res;
         }
-        _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+        _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination,
+                                         accumulate_destination, ConvertPolicy::SATURATE);
         _projection_outstage_res.allocator()->allocate();
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
             _projection_accumulate_res.allocator()->allocate();
         }
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
         {
-            quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+            quantized_projection_clip =
+                utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
         }
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
-                                       quantized_projection_clip));
+            _projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -quantized_projection_clip, quantized_projection_clip));
             _has_projection_clipping = true;
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
             _hidden_gate.allocator()->allocate();
@@ -600,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
     _copy_output.configure(compile_context, output_state_out, output);
 }
-Status CLQLSTMLayer::validate(const ITensorInfo *input,
-                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status CLQLSTMLayer::validate(const ITensorInfo *input,
+                              const ITensorInfo *input_to_forget_weights,
+                              const ITensorInfo *input_to_cell_weights,
+                              const ITensorInfo *input_to_output_weights,
+                              const ITensorInfo *recurrent_to_forget_weights,
+                              const ITensorInfo *recurrent_to_cell_weights,
+                              const ITensorInfo *recurrent_to_output_weights,
+                              const ITensorInfo *forget_gate_bias,
+                              const ITensorInfo *cell_bias,
+                              const ITensorInfo *output_gate_bias,
+                              const ITensorInfo *cell_state_in,
+                              const ITensorInfo *output_state_in,
+                              const ITensorInfo *cell_state_out,
+                              const ITensorInfo *output_state_out,
+                              const ITensorInfo *output,
                               const LSTMParams<ITensorInfo> &lstm_params)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
-                                        cell_state_out, output_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+                                        cell_state_in, output_state_in, cell_state_out, output_state_out, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -622,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+                                                   input_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+                                                   recurrent_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+                                                       input_to_output_weights, recurrent_to_forget_weights,
+                                                       recurrent_to_cell_weights, recurrent_to_output_weights);
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -647,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
     // Check whether peephole weights are all there or none
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                       lstm_params.cell_to_output_weights());
-        if(!lstm_params.has_cifg_opt())
+        if (!lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                               lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_input_weights());
         }
     }
@@ -674,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
@@ -682,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     // Precompute effective bias for optimizing the matmul computations.
     const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
     const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                                                               true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.input_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_forget_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_cell_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_output_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
-                                                                               lstm_params.hidden_state_zero(),
-                                                                               true)));
-        if(lstm_params.projection_bias() != nullptr)
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.projection_weights(), &projection_eff_bias_info,
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+        if (lstm_params.projection_bias() != nullptr)
         {
             ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
-                                                                       &projection_eff_bias_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+                                               &projection_eff_bias_info, ConvertPolicy::SATURATE));
         }
     }
-    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
-    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1,
+                                              input_to_forget_weights->data_type(),
+                                              input_to_forget_weights->quantization_info());
+    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                  recurrent_to_forget_weights->data_type(),
+                                                  recurrent_to_forget_weights->quantization_info());
     // Validate weights transpose
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
@@ -717,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
     }
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
     }
     GEMMLowpOutputStageInfo gemmlowp_info;
@@ -738,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     // Forget gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
-    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
-    const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_forget_scale, &mm_out_info, &forget_outstage_info));
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+                                            &forget_outstage_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                               &forget_outstage_info, ConvertPolicy::SATURATE));
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        const float cell_to_forget_scale = std::pow(2, cell_shift) *
+                                           lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+                                           lstm_params.forget_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                                   &forget_outstage_info, ConvertPolicy::SATURATE));
     }
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
         const ITensorInfo *b_info = forget_gate_bias;
@@ -770,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
     const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // Modulation gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
-    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
-    if(has_layer_norm)
+    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+                                      lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+                                            &cell_outstage_info));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+                                                               &cell_outstage_info, ConvertPolicy::SATURATE));
+
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
         const ITensorInfo *b_info = cell_bias;
@@ -791,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     }
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+                                        "Input gate bias must not be present when CIFG is used");
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+                                                                      &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+            input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+                                                       lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
-        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
-        if(lstm_params.has_peephole_opt())
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                                input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+        const float recurrent_to_input_scale =
+            lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                                &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+                                                &input_outstage_info));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                   &input_outstage_info, ConvertPolicy::SATURATE));
+
+        if (lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                            RoundingPolicy::TO_ZERO));
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+                                                    1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            const float cell_to_input_scale = std::pow(2, cell_shift) *
+                                              lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+                                              lstm_params.input_intermediate_scale();
+            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+                cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                       &input_outstage_info, ConvertPolicy::SATURATE));
         }
-        if(has_layer_norm)
+        if (has_layer_norm)
         {
            const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
            const ITensorInfo *b_info = lstm_params.input_gate_bias();
            ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
        }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+            &input_outstage_info, &input_gate_info,
+            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
     }
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
-    if(quantized_cell_clip > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    if (quantized_cell_clip > 0)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
-                                                                quantized_cell_clip)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLActivationLayer::validate(cell_state_out, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            -quantized_cell_clip, quantized_cell_clip)));
     }
     // Output gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
-    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
-    if(lstm_params.has_peephole_opt())
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+                                            &output_outstage_info));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                               &output_outstage_info, ConvertPolicy::SATURATE));
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+                                                             DataType::QSYMM16);
         // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
        // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+            cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+            RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                                   &output_outstage_info, ConvertPolicy::SATURATE));
     }
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
         const ITensorInfo *b_info = output_gate_bias;
@@ -877,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     }
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&output_outstage_info, &output_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // Hidden.
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = 
quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -980,14 +1210,14 @@ void CLQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget)); } @@ -1002,7 +1232,7 @@ void CLQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell)); } @@ -1010,7 +1240,7 @@ void CLQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -1022,14 +1252,14 @@ void CLQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input)); } @@ -1041,7 +1271,7 @@ void CLQLSTMLayer::run() _pixelwise_mul_forget_cell.run(); _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1052,14 +1282,14 @@ void CLQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output)); } @@ -1072,31 +1302,31 @@ void CLQLSTMLayer::run() _hidden_outstage.run(); // Projection. 
- if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1108,7 +1338,7 @@ void CLQLSTMLayer::run() void CLQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Pre-transpose weights to be used in GEMM. _input_to_forget_weights_transposed.allocator()->allocate(); @@ -1125,10 +1355,11 @@ void CLQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { _ones.map(true); - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); _ones.unmap(); } else @@ -1136,10 +1367,12 @@ void CLQLSTMLayer::prepare() _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } }; + ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights}, + {ACL_DST, &_input_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); - ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } }; + ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights}, + {ACL_DST, &_recurrent_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); @@ -1156,30 +1389,35 @@ void CLQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } }; + ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights}, + {ACL_DST, &_input_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); - ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } }; + ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights}, + {ACL_DST, &_recurrent_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); - ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } }; + ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); - ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } }; + ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights}, + {ACL_DST, 
&_recurrent_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); - ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } }; + ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights}, + {ACL_DST, &_input_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); - ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } }; + ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights}, + {ACL_DST, &_recurrent_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); - if(_has_projection) + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } }; + ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}}; CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); - if(_projection_bias != nullptr) + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1189,7 +1427,7 @@ void CLQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index b249bdd1db..6edef29992 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClQuantize.h" @@ -32,13 +33,12 @@ namespace arm_compute { struct CLQuantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLQuantizationLayer::CLQuantizationLayer() - : _impl(std::make_unique()) +CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique()) { } CLQuantizationLayer::~CLQuantizationLayer() = default; diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 6f122866b2..34b78eefa7 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -28,24 +28,37 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_kernel(), + _activation(), + _fully_connected_kernel(), + _copy(), + _fully_connected_out(), + _gemm_output(), + 
_add_output(), _is_prepared(false) { } CLRNNLayer::~CLRNNLayer() = default; -Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status CLRNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -63,28 +76,42 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, +void CLRNNLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, + output, info); } -void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info) +void CLRNNLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); ARM_COMPUTE_LOG_PARAMS(input, weights, 
recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); @@ -133,7 +160,7 @@ void CLRNNLayer::run() void CLRNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected_kernel.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 867ef7c7ac..1939d1d0ba 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -24,26 +24,36 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" namespace arm_compute { -Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index 239a1c6bb2..0d2eab0c76 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -22,24 +22,35 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" + #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" using namespace arm_compute; -Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 3fbbd5f952..5c3f7f9c8c 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLRangeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLRangeKernel.h" using namespace arm_compute; @@ -38,7 +38,8 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRange::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_LOG_PARAMS(output, start, end, step); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index cddbf77d7c..6c6daff5ba 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -27,23 +27,25 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -51,29 +53,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -87,8 +89,9 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); - if(requant) + const bool requant = + is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); + if (requant) { TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); CLDequantizationLayer::validate(input, &input_no_quant); @@ -98,10 +101,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } return Status{}; } -} +} // namespace CLReduceMean::CLReduceMean(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(), + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _dequant(), + _requant(), + _reduction_ops(), + _keep_dims(), + _do_requant(), + _input_no_quant(), _output_no_quant() { } @@ -111,17 +123,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); } -void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const 
Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +void CLReduceMean::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const Coordinates &reduction_axis, + bool keep_dims, + ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info(); + _do_requant = is_data_type_quantized(input->info()->data_type()) && + input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); @@ -129,7 +147,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor ICLTensor *tmp_input = input; ICLTensor *tmp_output = output; - if(_do_requant) + if (_do_requant) { _memory_group.manage(&_input_no_quant); _memory_group.manage(&_output_no_quant); @@ -148,46 +166,51 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], + ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), + tmp_input->info()->data_type(), + tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], + ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!_keep_dims) + if (!_keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); } - if(_do_requant) + if (_do_requant) { _requant.configure(compile_context, &_output_no_quant, output); _input_no_quant.allocator()->allocate(); @@ -195,7 +218,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor } } -Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status CLReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -204,19 +230,19 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_requant) + if (_do_requant) { _dequant.run(); } - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } - if(_do_requant) + if (_do_requant) { _requant.run(); } diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index cdc7fec51b..ba5489018e 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -27,35 +27,43 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLReductionOperation::CLReductionOperation(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false) + : _memory_group(std::move(memory_manager)), + _unreshaped_output(), + _reduction_kernel(), + _reshape(), + _reduction_axis(), + _is_reshape_required(false) { } CLReductionOperation::~CLReductionOperation() = default; -Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status CLReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of 
dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const bool is_reshape_required = !keep_dims; - if(is_reshape_required && output->total_size() != 0) + if (is_reshape_required && output->total_size() != 0) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -67,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_qinfo = input->quantization_info(); const auto output_data_type = output->data_type(); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; - if(is_reshape_required) + if (is_reshape_required) { auto shape_before_reshape = input_shape; shape_before_reshape.set(axis, 1); - initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); + initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, + input_qinfo); output_internal = &output_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output)); } @@ -92,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output) { - if(!_is_reshape_required) + if (!_is_reshape_required) { return output; } @@ -103,12 +112,18 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor return &_unreshaped_output; } -void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure( + ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); } -void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -117,11 +132,17 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC auto *output_internal = configure_intermediate_result_vector(input, output); - if(_is_reshape_required) + if (_is_reshape_required) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, 
false); - const auto output_data_type = input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _memory_group.manage(&_unreshaped_output); } @@ -129,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _reduction_kernel = std::make_unique(); _reduction_kernel->configure(compile_context, input, output_internal, axis, op); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(compile_context, &_unreshaped_output, output); _unreshaped_output.allocator()->allocate(); @@ -142,7 +163,7 @@ void CLReductionOperation::run() CLScheduler::get().enqueue(*_reduction_kernel, false); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 15de959225..156e9b90c1 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include @@ -40,7 +40,10 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_LOG_PARAMS(input, output, stride); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index c51a3298c1..3d6349fb25 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClReshape.h" @@ -35,17 +36,16 @@ namespace arm_compute { struct CLReshapeLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLReshapeLayer::CLReshapeLayer() - : _impl(std::make_unique()) +CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique()) { } -CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; +CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default; CLReshapeLayer::~CLReshapeLayer() = default; @@ -78,4 +78,4 @@ void CLReshapeLayer::run() _impl->op->run(pack); } } // namespace arm_compute -/** [CLReshapeLayer snippet] **/ \ No newline at end of file + /** [CLReshapeLayer snippet] **/ diff 
--git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 1fc93571d9..415de52e64 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLReverse.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLReverseKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReverseKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); } -void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis) { ARM_COMPUTE_LOG_PARAMS(input, output, axis); auto k = std::make_unique<CLReverseKernel>(); diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 5b78989bfa..abff0724e4 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClScale.h" @@ -33,13 +34,12 @@ namespace arm_compute { struct CLScale::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClScale> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClScale> op{nullptr}; }; -CLScale::CLScale() - : _impl(std::make_unique<Impl>()) +CLScale::CLScale() : _impl(std::make_unique<Impl>()) { } CLScale::~CLScale() = default; @@ -49,7 +49,10 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) +void CLScale::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ScaleKernelInfo &info) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index c4ab3dc67a..b4897d9e62 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSelectKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSelectKernel.h" using namespace arm_compute; @@ -38,7 +38,11 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); } -void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelect::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(c, x, y, output); auto k = std::make_unique<CLSelectKernel>(); diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index 7e3ac7d769..f79c6a1235 --- a/src/runtime/CL/functions/CLSlice.cpp +++
b/src/runtime/CL/functions/CLSlice.cpp @@ -26,15 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn _kernel = std::move(k); } -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct CLSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLSlice::CLSlice() - : _impl(std::make_unique()) +CLSlice::CLSlice() : _impl(std::make_unique()) { } -CLSlice::CLSlice(CLSlice &&) = default; +CLSlice::CLSlice(CLSlice &&) = default; CLSlice &CLSlice::operator=(CLSlice &&) = default; CLSlice::~CLSlice() = default; -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::CLSlice::validate(input, output, starts, ends); } @@ -89,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); } -void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index d52352fc8d..2e70e2aa08 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -22,12 +22,14 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" #include "src/gpu/cl/operators/ClPermute.h" @@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax; template struct CLSoftmaxLayerGeneric::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor } template -void CLSoftmaxLayerGeneric::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) +void CLSoftmaxLayerGeneric::configure( + const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { _impl->src = input; _impl->dst = output; _impl->op = std::make_unique(); - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis}; _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template -Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis}; return OperatorType::validate(*input, *output, softmax_info); } template -void CLSoftmaxLayerGeneric::run() +void CLSoftmaxLayerGeneric::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index 3b7083400b..37f728895f 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,71 +29,100 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(std::make_unique()), - _fill(), - _has_padding(false) + : _space_to_batch_kernel(std::make_unique()), _fill(), _has_padding(false) { } CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const ICLTensor 
*block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, + output); } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { - 
ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -101,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void CLSpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { //CLScheduler::get().enqueue(*_fill, true); _fill.run(); diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 67dafff47f..22695c9ef3 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" namespace arm_compute { -CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel(std::make_unique()) +CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique()) { } @@ -47,7 +46,10 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); _space_to_depth_kernel->configure(compile_context, input, output, block_shape); diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index 0b27371e3f..6be43cc5cd 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" namespace arm_compute @@ -38,7 +39,7 @@ void CLSplit::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < 
_num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 6a335da00c..c15496fc31 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include <memory> - #include "arm_compute/runtime/CL/functions/CLStackLayer.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -32,16 +30,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLStackLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" + +#include <memory> namespace arm_compute { CLStackLayer::CLStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -52,7 +50,10 @@ void CLStackLayer::configure(const std::vector &input, int axis, IC configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); } -void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) +void CLStackLayer::configure(const CLCompileContext &compile_context, + const std::vector<ICLTensor *> &input, + int axis, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, axis, output); _num_inputs = input.size(); @@ -61,7 +62,7 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std: // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>()); _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); @@ -79,7 +80,7 @@ Status CLStackLayer::validate(const std::vector &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -92,7 +93,7 @@ Status CLStackLayer::validate(const std::vector &input, int axis, void CLStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { CLScheduler::get().enqueue(*_stack_kernels[i], false); } diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index 261bdc13d1..c1953cc415 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides 
&strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto k = std::make_unique(); @@ -43,9 +49,14 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IT _kernel = std::move(k); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct CLStridedSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) - : _impl(std::make_unique()) +CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; +CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default; CLStridedSlice::~CLStridedSlice() = default; -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -86,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, + end_mask, shrink_axis_mask); } -Status CLStridedSlice::validate(const 
ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } void CLStridedSlice::run() diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index ef790995f9..4f86c4adfa 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "src/core/CL/kernels/CLTileKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLTileKernel.h" namespace arm_compute { @@ -34,7 +33,10 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTile::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_LOG_PARAMS(input, output, multiples); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index e63c92eeb4..5a738f47ce 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClTranspose.h" @@ -34,12 +35,11 @@ namespace arm_compute { struct CLTranspose::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLTranspose::CLTranspose() - : _impl(std::make_unique()) +CLTranspose::CLTranspose() : _impl(std::make_unique()) { } CLTranspose::~CLTranspose() = default; @@ -70,4 +70,4 @@ void CLTranspose::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index 98d47810ab..ddd83e7824 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -40,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to 
slice just one 2d tensor at a time. Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -56,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace CLUnstack::CLUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } @@ -66,15 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); } -void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector &output_vector, int axis) +void CLUnstack::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector &output_vector, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); std::vector outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ICLTensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -87,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } @@ -106,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector output_vector.size()); Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void CLUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index b416d0fcf1..645f817030 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" @@ -35,15 +36,15 @@ namespace arm_compute { struct CLWinogradConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) @@ -54,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptrsrc = input; _impl->weights = weights; @@ -70,20 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, + enable_fast_math); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, _impl->src }, - { TensorType::ACL_SRC_1, _impl->weights }, - { TensorType::ACL_SRC_2, _impl->biases }, - { TensorType::ACL_DST, _impl->dst } - }; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } @@ -97,7 +114,7 @@ void CLWinogradConvolutionLayer::run() void CLWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -107,4 +124,4 @@ void CLWinogradConvolutionLayer::prepare() _impl->is_prepared = true; } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp index 18ade97885..4270165ab4 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp @@ -25,6 +25,7 @@ 
#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,109 +44,109 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Bifrost architectures - static std::map gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G71 configurations - static std::map gemm_g71_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g71_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G52 configurations - static std::map gemm_g52_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g52_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, 
&CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G76 configurations - static std::map gemm_g76_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g76_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G71: - if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) + if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G76: - if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) + if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G52: - if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) + if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) { - return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE; - if(is_rhs_constant) + if (is_rhs_constant) { - if((m > 1) && (n < 16)) + if ((m > 1) && (n < 16)) { gemm_type = CLGEMMKernelType::RESHAPED; } - else if(m == 1) + else if (m == 1) { gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if((k > 256) && (m > 4)) + if ((k > 256) && (m > 4)) { constexpr float alpha = 3.2f; constexpr float fact0 = 1.51f; constexpr float fact1 = 1.66f; 
constexpr float ops = 12.0f; const float scale = k > 1024 ? 1.07f : 1.0f; - gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::RESHAPED_ONLY_RHS; + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) + ? CLGEMMKernelType::RESHAPED + : CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { @@ -156,19 +156,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned const auto workload = static_cast((m * n) / 20.0f); - gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED : gemm_type; + gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED + : gemm_type; } return gemm_type; } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -183,11 +185,12 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(k <= 496) + if (k <= 496) { - if(n <= 544) + if (n <= 544) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } else { - if(k <= 588) + if (k <= 588) { - if(k <= 552) + if (k <= 552) { - if(m <= 148) + if (m <= 148) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 278) + if (m <= 278) { return CLGEMMKernelType::RESHAPED; } @@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int const float r_nk = static_cast(n) / static_cast(k); const float r_mnk = static_cast(m) / (static_cast(n) * 
static_cast(k)); - if(r_mn <= 1.5469f) + if (r_mn <= 1.5469f) { - if(r_mk <= 0.8766f) + if (r_mk <= 0.8766f) { - if(r_mk <= 0.0211f) + if (r_mk <= 0.0211f) { - if(r_mnk <= 77.5833f) + if (r_mnk <= 77.5833f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.0832f) + if (r_nk <= 0.0832f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mnk <= 193.0000f) + if (r_mnk <= 193.0000f) { - if(r_mn <= 0.9948f) + if (r_mn <= 0.9948f) { - if(r_mk <= 2.5453f) + if (r_mk <= 2.5453f) { return CLGEMMKernelType::RESHAPED; } @@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 17.7370f) + if (r_mn <= 17.7370f) { - if(r_mnk <= 1391.2875f) + if (r_mnk <= 1391.2875f) { - if(r_mk <= 2.9724f) + if (r_mk <= 2.9724f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mnk <= 470.0000f) + if (r_mnk <= 470.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.1381f) + if (r_nk <= 0.1381f) { - if(r_mnk <= 9040.5000f) + if (r_mnk <= 9040.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 5.6790f) + if (r_mn <= 5.6790f) { return CLGEMMKernelType::RESHAPED; } @@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int const float r_mn = static_cast(m) / static_cast(n); const float r_nk = static_cast(n) / static_cast(k); - if(k <= 212) + if (k <= 212) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_nk <= 0.4990234375f) + if (r_nk <= 0.4990234375f) { - if(k <= 1392) + if (k <= 1392) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 325) + if (m <= 325) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } else { - if(k <= 471) + if (k <= 471) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mn <= 0.04475911520421505f) + if (r_mn <= 0.04475911520421505f) { return CLGEMMKernelType::RESHAPED; } @@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - 
if(n <= 127.0000f) + if (n <= 127.0000f) { - if(n <= 63.5000f) + if (n <= 63.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 3616.0000f) + if (m <= 3616.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 2970.5000f) + if (m <= 2970.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 12.5000f) + if (m <= 12.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 490.0000f) + if (m <= 490.0000f) { - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 226.0000f) + if (m <= 226.0000f) { - if(n <= 140.0000f) + if (n <= 140.0000f) { - if(m <= 179.5000f) + if (m <= 179.5000f) { return CLGEMMKernelType::RESHAPED; } @@ -556,15 +563,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); ARM_COMPUTE_UNUSED(n); ARM_COMPUTE_UNUSED(k); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp index ef30b28f96..673038a8db 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -35,8 +36,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures - static std::map gemm_configs = - { - { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 } - }; + static std::map gemm_configs = { + {DataType::F32, 
&CLGEMMDefaultTypeMidgard::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}}; const DataType data_type = params.data_type; - if(gemm_configs.find(data_type) != gemm_configs.end()) + if (gemm_configs.find(data_type) != gemm_configs.end()) { return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } @@ -68,7 +67,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -76,7 +76,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -84,7 +85,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp index 9e779d3752..851e23bc84 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,135 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Valhall architectures - static std::map gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { 
DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G77 configurations - static std::map gemm_g77_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g77_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G78 configurations - static std::map gemm_g78_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g78_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G710 and Mali-G610 configurations - static std::map gemm_g710_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g710_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G715 and Mali-G615 configurations - static std::map gemm_g715_configs = - { - { DataType::F32, 
&CLGEMMDefaultTypeValhall::g715_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g715_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G710: case GPUTarget::G610: - if(gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) + if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) { - return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G715: case GPUTarget::G615: - if(gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) + if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) { - return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G78: - if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) + if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) { - return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G77: - if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) { - return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -182,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 471.0000f) + if (k <= 471.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 72.5000f) + if (m <= 72.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 90.5000f) + if (m <= 90.5000f) { return CLGEMMKernelType::RESHAPED; } else { - if(k <= 2448.0000f) + if (k <= 2448.0000f) { - if(n <= 756.0000f) + if (n <= 756.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -241,11 +243,12 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } @@ -253,9 +256,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int return CLGEMMKernelType::RESHAPED_ONLY_RHS; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool 
is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return default_f32(m, n, k, b, is_rhs_constant); } @@ -263,7 +267,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } @@ -273,9 +277,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return g78_f16(m, n, k, b, is_rhs_constant); } @@ -283,7 +288,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h index 6189a324cf..c528dbcac4 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -25,6 +25,7 @@ #define SRC_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" @@ -45,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(gpu); diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp index b06c3b0f8e..8df57197e2 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" @@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_ bool valid = false; CLGEMMKernelType gemm_type{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, 
query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", + to_string(gemm_type).c_str()); } else { @@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr gemm_config = + ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query) @@ -100,32 +104,36 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshapedOnlyRHS config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, - config.export_cl_image); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, + !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query) { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr gemm_config = + ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) @@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; 
mlgo::GEMMConfigReshaped config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs, - config.transpose_rhs, config.export_cl_image); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, + config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) @@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) std::unique_ptr gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) @@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigNative config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); + std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query 
failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } } // namespace auto_heuristics } // namespace cl_gemm -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h index 020237b7f4..f544715e03 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h @@ -50,8 +50,7 @@ struct CommonQuery /** Result of querying about GEMM type ( @ref CLGEMMKernelType) */ struct GEMMTypeResult { - GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) - : valid{ valid }, gemm_type{ gemm_type } + GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type} { } /** Test if the result is valid */ @@ -67,7 +66,7 @@ struct GEMMTypeResult struct GEMMConfigResult { GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info) - : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info } + : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info} { } /** Test if the result is valid */ @@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query); } // namespace cl_gemm } // namespace arm_compute -#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H \ No newline at end of file +#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h index c451bd9062..08a7ee8c18 100644 --- a/src/runtime/CL/mlgo/Common.h +++ b/src/runtime/CL/mlgo/Common.h @@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType; /** GEMM Configuration for Native kernel */ struct GEMMConfigNative { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** GEMM Configuration for Reshaped Only RHS kernel */ struct GEMMConfigReshapedOnlyRHS { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. 
n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; /** GEMM Configuration for Reshaped kernel */ struct GEMMConfigReshaped { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; } // namespace mlgo } // namespace arm_compute -#endif // SRC_RUNTIME_CL_MLGO_COMMON_H \ No newline at end of file +#endif // SRC_RUNTIME_CL_MLGO_COMMON_H diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp index 1c75cdc427..f7b706902b 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.cpp +++ b/src/runtime/CL/mlgo/HeuristicTree.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/runtime/CL/mlgo/HeuristicTree.h" + #include "arm_compute/core/Log.h" #include "support/Cast.h" @@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond) // PRE: all features and ConditionalOps are valid constexpr float eps = 0.0001f; // Calculate all secondary features - std::vector> cond_values - { - { "m", static_cast(shape.m) }, - { "n", static_cast(shape.n) }, - { "k", static_cast(shape.k) }, - { "b", static_cast(shape.b) }, - { "r_mn", static_cast(shape.m) / shape.n }, - { "r_mk", static_cast(shape.m) / shape.k }, - { "r_nk", static_cast(shape.n) / shape.k }, - { "r_mnk", static_cast(shape.m) / (static_cast(shape.n) / shape.k) }, - { "workload", (static_cast(shape.m) * shape.n * shape.b) / 20.0 } - }; - auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(), - [&cond](decltype(*cond_values.begin()) it) - { - return it.first == cond.feature; - }); + std::vector> cond_values{ + {"m", static_cast(shape.m)}, + {"n", static_cast(shape.n)}, + {"k", static_cast(shape.k)}, + {"b", static_cast(shape.b)}, + {"r_mn", static_cast(shape.m) / shape.n}, + {"r_mk", static_cast(shape.m) / shape.k}, + {"r_nk", static_cast(shape.n) / shape.k}, + {"r_mnk", static_cast(shape.m) / (static_cast(shape.n) / shape.k)}, + {"workload", (static_cast(shape.m) * shape.n * shape.b) / 20.0}}; + auto cond_value_pair_it = + std::find_if(cond_values.begin(), cond_values.end(), + [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; }); ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end()); const float cond_value = cond_value_pair_it->second; - switch(cond.op) + switch (cond.op) { case ConditionalOp::LT: { @@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes; constexpr size_t HeuristicTree::_max_query_depth; constexpr HeuristicTree::NodeID HeuristicTree::_root; -HeuristicTree::HeuristicTree() - : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) +HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) { } HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type) - : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{} + : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{} { } @@ -108,16 +104,17 @@ std::pair HeuristicTree::query(GEMMShape shape) const // Root ID = 0; auto cur_node = _tree.at(_root).get(); size_t depth = 0; - while(cur_node->type() != NodeType::Leaf) + while (cur_node->type() != NodeType::Leaf) { - if(depth > _max_query_depth) + if (depth > _max_query_depth) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. 
Is the tree too deep?", + _max_query_depth); return std::make_pair(false, T{}); } ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType"); auto br_node = utils::cast::polymorphic_downcast(cur_node); - if(evaluate(shape, br_node->condition)) + if (evaluate(shape, br_node->condition)) { cur_node = _tree.at(br_node->true_node).get(); } @@ -135,12 +132,12 @@ std::pair HeuristicTree::query(GEMMShape shape) const template bool HeuristicTree::add_leaf(NodeID id, T val) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val) bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - const std::set supported_features = - { - "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload" - }; - const auto orig_feature = cond.feature; - std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c) - { - return std::tolower(c); - }); - if(supported_features.find(cond.feature) == supported_features.end()) + const std::set supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"}; + const auto orig_feature = cond.feature; + std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), + [](char c) { return std::tolower(c); }); + if (supported_features.find(cond.feature) == supported_features.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str()); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID bool HeuristicTree::check_if_structurally_correct() const { std::set visited; - std::deque to_visit{ _root }; + std::deque to_visit{_root}; - while(!to_visit.empty()) + while (!to_visit.empty()) { auto id = to_visit.front(); to_visit.pop_front(); - if(_tree.find(id) == _tree.end()) + if (_tree.find(id) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id); return false; } auto not_seen_before = visited.insert(id); - if(!not_seen_before.second) + if (!not_seen_before.second) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops"); return false; } auto cur_node = _tree.at(id).get(); - if(cur_node->type() == NodeType::Branch) + if (cur_node->type() == NodeType::Branch) { auto br_node = utils::cast::polymorphic_downcast(cur_node); to_visit.push_back(br_node->true_node); to_visit.push_back(br_node->false_node); } } - if(visited.size() != _tree.size()) + if (visited.size() != _tree.size()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes"); return false; @@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const bool HeuristicTree::check() { - if(_tree.empty()) + if (_tree.empty()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty 
tree encountered"); return false; } - if(_tree.find(_root) == _tree.end()) + if (_tree.find(_root) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root); return false; @@ -237,7 +229,8 @@ template std::pair HeuristicTree::query(GEMMShape shap /** Explicit template instantiation @relates HeuristicTree */ template std::pair HeuristicTree::query(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ -template std::pair HeuristicTree::query(GEMMShape shape) const; +template std::pair +HeuristicTree::query(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ template std::pair HeuristicTree::query(GEMMShape shape) const; diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h index d5c7de2215..a4f8c116b9 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.h +++ b/src/runtime/CL/mlgo/HeuristicTree.h @@ -25,6 +25,7 @@ #define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H #include "arm_compute/core/Types.h" + #include "src/runtime/CL/mlgo/Common.h" #include @@ -84,7 +85,7 @@ public: struct BranchNode : public Node { BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node) - : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node } + : id{id}, condition{cond}, true_node{t_node}, false_node{f_node} { } NodeType type() const override @@ -100,8 +101,7 @@ public: template struct LeafNode : public Node { - LeafNode(NodeID id, T val) - : id{ id }, value{ val } + LeafNode(NodeID id, T val) : id{id}, value{val} { } NodeType type() const override @@ -177,22 +177,22 @@ public: bool check(); private: - static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query - static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree - static constexpr NodeID _root{ 0 }; // Root tree ID + static constexpr size_t _max_query_depth{1000}; // Maximum depth of query + static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree + static constexpr NodeID _root{0}; // Root tree ID private: bool check_if_structurally_correct() const; private: - TreeID _id; /**< Heuristic tree ID */ - HeuristicType _heuristic_type; /**< Heuristic type */ - std::string _ip_target; /**< IP target associated with the tree */ - DataType _data_type; /**< Data type associated with the tree */ - std::map> _tree; /**< Tree representation */ + TreeID _id; /**< Heuristic tree ID */ + HeuristicType _heuristic_type; /**< Heuristic type */ + std::string _ip_target; /**< IP target associated with the tree */ + DataType _data_type; /**< Data type associated with the tree */ + std::map> _tree; /**< Tree representation */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp index 80f3bb85e9..aed46cd80f 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp +++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp @@ -24,6 +24,7 @@ #include "src/runtime/CL/mlgo/MLGOHeuristics.h" #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/MLGOParser.h" #include "src/runtime/CL/mlgo/Utils.h" @@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs) } bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs) { - return std::tie(lhs.m0, lhs.n0, 
lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, - rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == + std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, - rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, + lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs, + rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } constexpr size_t MLGOHeuristics::_max_num_trees; -MLGOHeuristics::MLGOHeuristics() - : _indices{}, _trees{}, _tree_valid{}, _valid{ false } +MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false} { } @@ -59,71 +60,74 @@ std::pair MLGOHeuristics::query_gemm_type(const Query &query) co { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str()); const auto invalid = GEMMType::RESHAPED; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_native(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigNative{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. 
%s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshapedOnlyRHS{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshaped{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } @@ -131,14 +135,14 @@ std::pair MLGOHeuristics::query_gemm_config_reshaped(c bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) { bool status; - HeuristicTree *tree{ nullptr }; + HeuristicTree *tree{nullptr}; std::tie(status, tree) = get_heuristic_tree(id); - if(!status) + if (!status) { return status; } status = tree->check(); - if(!status) + if (!status) { return status; } @@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) bool MLGOHeuristics::check_all() const { // Tree validities are already checked and cached. - bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) - { - return !v.second; - }) - == _tree_valid.end(); - if(!all_trees_are_checked) + bool all_trees_are_checked = + std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end(); + if (!all_trees_are_checked) { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo"); + ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each " + "tree is completed. 
This could also indicate there are no trees in the dotmlgo"); return false; } @@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const std::pair MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id) { - if(_indices.find(id) == _indices.end()) + if (_indices.find(id) == _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id); return std::make_pair(false, nullptr); } const auto index = _indices[id]; - if(_trees.find(index) == _trees.end()) + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); return std::make_pair(false, nullptr); @@ -186,7 +188,7 @@ std::pair MLGOHeuristics::get_heuristic_tree(HeuristicTre bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) { - if(_indices.size() >= _max_num_trees) + if (_indices.size() >= _max_num_trees) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees); return false; @@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // PRE: correctness of t is guaranteed by the tree construction process // Ensure unique id const auto id = t.id(); - if(_indices.find(id) != _indices.end()) + if (_indices.find(id) != _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id); return false; @@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // Ensure unique index const auto index = t.index(); - if(_trees.find(index) != _trees.end()) + if (_trees.find(index) != _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists"); return false; @@ -219,9 +221,10 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", + filename.c_str()); return _valid = false; } return reload_from_stream(fs); @@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) bool MLGOHeuristics::reload_from_stream(std::istream &in) { auto parsed = parser::parse_mlgo(in); - if(!parsed.first) + if (!parsed.first) { ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. 
Use default heuristics instead"); return _valid = false; @@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in) } } // namespace mlgo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h index aa21225959..6a491c5503 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.h +++ b/src/runtime/CL/mlgo/MLGOHeuristics.h @@ -135,16 +135,16 @@ public: bool check_all() const; private: - static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/ + static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/ private: // There exists a one-to-one mappipng between TreeID and Index, either can be used to identify a @ref HeuristicTree std::map _indices; /**< A mapping from TreeID to Index */ std::map _trees; /**< A mapping from Index to HeuristicTree */ std::map _tree_valid; /**< Result cache of the tree validity checks */ - bool _valid; /**< Overall validity */ + bool _valid; /**< Overall validity */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp index 625739e450..893daf2ed9 100644 --- a/src/runtime/CL/mlgo/MLGOParser.cpp +++ b/src/runtime/CL/mlgo/MLGOParser.cpp @@ -22,19 +22,21 @@ * SOFTWARE. */ #include "src/runtime/CL/mlgo/MLGOParser.h" + #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/Utils.h" #include #define CHECK(parser_expr, valid_var) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return; #define CHECK_DEFAULT(parser_expr, valid_var, default_val) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return default_val; #ifdef ARM_COMPUTE_LOGGING_ENABLED @@ -53,8 +55,7 @@ valid_var = false; \ return default_val; -#define LOG_TOKEN_POS(tokens, pos_var) \ - const auto pos_var = tokens.current_pos(); +#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos(); #else // ARM_COMPUTE_LOGGING_ENABLED @@ -73,19 +74,12 @@ namespace { void ltrim(std::string &str) { - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) - { - return !std::isspace(ch); - })); + str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); })); } void rtrim(std::string &str) { - str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) - { - return !std::isspace(ch); - }).base(), - str.end()); + str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end()); } void trim(std::string &str) @@ -109,7 +103,7 @@ enum class ComparatorType }; TokenStream::TokenStream(std::istream &s, const std::string &delims) - : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{} + : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{} { read(); } @@ -125,7 +119,7 @@ Token TokenStream::take() ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); Token t = _tokens.front(); _tokens.pop_front(); - if(_tokens.empty()) + if (_tokens.empty()) { read(); } @@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i) ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead"); // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End 
token at the end - while(_istream && _tokens.size() <= i) + while (_istream && _tokens.size() <= i) { read(); } @@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i) void advance(CharPosition &pos, char ch) { - if(ch == '\n') + if (ch == '\n') { pos.ln += 1; pos.col = 0; @@ -167,17 +161,16 @@ void TokenStream::read() do { // Reached eof - if(!_istream.get(ch)) + if (!_istream.get(ch)) { - if(!reached_end()) + if (!reached_end()) { _tokens.emplace_back(TokenType::End, "", _lookahead_pos); } return; } advance(_lookahead_pos, ch); - } - while(std::isspace(ch) || is_delim(ch)); + } while (std::isspace(ch) || is_delim(ch)); // Read chars until we hit a delim or eof auto orig_pos = _lookahead_pos; auto tok = recognize_tok(ch); @@ -190,41 +183,41 @@ void TokenStream::read() Token TokenStream::recognize_tok(char ch) { - if(ch == '[') + if (ch == '[') { - return Token{ TokenType::L_List, "", _lookahead_pos }; + return Token{TokenType::L_List, "", _lookahead_pos}; } - else if(ch == ']') + else if (ch == ']') { - return Token{ TokenType::R_List, "", _lookahead_pos }; + return Token{TokenType::R_List, "", _lookahead_pos}; } - else if(ch == '.') + else if (ch == '.') { - return float_after_dp_st(std::string{ ch }); + return float_after_dp_st(std::string{ch}); } - else if(std::isdigit(ch)) + else if (std::isdigit(ch)) { - return num_st(std::string{ ch }); + return num_st(std::string{ch}); } else { - return text_st(std::string{ ch }); + return text_st(std::string{ch}); } } Token TokenStream::num_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(ch == '.') + if (ch == '.') { return float_after_dp_st(value + ch); } - else if(!std::isdigit(ch)) + else if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value) } value += ch; } - return Token{ TokenType::Int, value, _lookahead_pos }; + return Token{TokenType::Int, value, _lookahead_pos}; } Token TokenStream::float_after_dp_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(!std::isdigit(ch)) + if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value) } value += ch; } - return Token{ TokenType::Float, value, _lookahead_pos }; + return Token{TokenType::Float, value, _lookahead_pos}; } Token TokenStream::text_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(is_delim(ch)) + if (is_delim(ch)) { break; } - if(ch == '[' || ch == ']') + if (ch == '[' || ch == ']') { rewind(_lookahead_pos); _istream.unget(); @@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value) } value += ch; } - return Token{ TokenType::Text, value, _lookahead_pos }; + return Token{TokenType::Text, value, _lookahead_pos}; } bool TokenStream::reached_end() const @@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::End) + if (tok.type != TokenType::End) { FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream"); } @@ -301,7 +294,7 @@ bool bool_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) 
+ if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token"); } @@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token"); } @@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); int val = CHECK_DEFAULT(int_val(in, valid), valid, 0); - if(val < 0) + if (val < 0) { FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token"); } @@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Float) + if (tok.type != TokenType::Float) { FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token"); } @@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Text || tok.value.empty()) + if (tok.type != TokenType::Text || tok.value.empty()) { FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token"); } @@ -361,9 +354,9 @@ std::string text_val(TokenStream &in, bool &valid) bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) { auto tok = in.peek(); - if(tok.type == TokenType::Text && tok.value == c_str) + if (tok.type == TokenType::Text && tok.value == c_str) { - if(take) + if (take) { in.take(); } @@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) void expect_text(TokenStream &in, const std::string &str, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_text(in, str)) + if (!accept_text(in, str)) { FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str); } @@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid) bool accept_l_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::L_List) + if (tok.type == TokenType::L_List) { in.take(); return true; @@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in) void expect_l_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_l_list(in)) + if (!accept_l_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect '['"); } @@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid) bool accept_r_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::R_List) + if (tok.type == TokenType::R_List) { in.take(); return true; @@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in) void expect_r_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_r_list(in)) + if (!accept_r_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect ']'"); } @@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid) ConditionalOp conditional_op(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "<=")) + if (accept_text(in, "<=")) { return ConditionalOp::LE; } - else if(accept_text(in, ">=")) + else if (accept_text(in, ">=")) { return ConditionalOp::GE; } - else if(accept_text(in, "==")) + else if (accept_text(in, "==")) { return ConditionalOp::EQ; } - else if(accept_text(in, "<")) + else if (accept_text(in, "<")) { return ConditionalOp::LT; } - else if(accept_text(in, ">")) + else if (accept_text(in, ">")) { return ConditionalOp::GT; } @@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool &valid) { CHECK(expect_text(in, "ip-type", valid), valid); LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gpu")) + if 
(accept_text(in, "gpu")) { ; } - else if(accept_text(in, "cpu")) + else if (accept_text(in, "cpu")) { ; } @@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid) DataType data_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "f16")) + if (accept_text(in, "f16")) { return DataType::F16; } - else if(accept_text(in, "f32")) + else if (accept_text(in, "f32")) { return DataType::F32; } - else if(accept_text(in, "qasymm8")) + else if (accept_text(in, "qasymm8")) { return DataType::QASYMM8; } @@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid) ComparatorType comparator_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "var")) + if (accept_text(in, "var")) { return ComparatorType::Var; } - else if(accept_text(in, "num")) + else if (accept_text(in, "num")) { return ComparatorType::Num; } - else if(accept_text(in, "enum")) + else if (accept_text(in, "enum")) { return ComparatorType::Enum; } @@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid) HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gemm-type", take)) + if (accept_text(in, "gemm-type", take)) { return HeuristicType::GEMM_Type; } - else if(accept_text(in, "gemm-config-native", take)) + else if (accept_text(in, "gemm-config-native", take)) { return HeuristicType::GEMM_Config_Native; } - else if(accept_text(in, "gemm-config-reshaped-only-rhs", take)) + else if (accept_text(in, "gemm-config-reshaped-only-rhs", take)) { return HeuristicType::GEMM_Config_Reshaped_Only_RHS; } - else if(accept_text(in, "gemm-config-reshaped", take)) + else if (accept_text(in, "gemm-config-reshaped", take)) { return HeuristicType::GEMM_Config_Reshaped; } @@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val { LOG_TOKEN_POS(in, pos); auto ht = CHECK(heuristic_type(in, valid, false), valid); - if(ht != expected_ht) + if (ht != expected_ht) { FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); } @@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val GEMMType gemm_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "native")) + if (accept_text(in, "native")) { return GEMMType::NATIVE; } - else if(accept_text(in, "reshaped-only-rhs")) + else if (accept_text(in, "reshaped-only-rhs")) { return GEMMType::RESHAPED_ONLY_RHS; } - else if(accept_text(in, "reshaped")) + else if (accept_text(in, "reshaped")) { return GEMMType::RESHAPED; } @@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid) const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigNative{ m0, n0, k0 }; + return GEMMConfigNative{m0, n0, k0}; } GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid) @@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex }; + return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex}; } GEMMConfigReshaped 
gemm_config_reshaped(TokenStream &in, bool &valid) @@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex }; + return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex}; } void gpu_priority(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "best-performance")) + if (accept_text(in, "best-performance")) { ; } - else if(accept_text(in, "best-memory-usage")) + else if (accept_text(in, "best-memory-usage")) { ; } @@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid) void gpu_behavior(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "static")) + if (accept_text(in, "static")) { ; } - else if(accept_text(in, "dynamic")) + else if (accept_text(in, "dynamic")) { ; } @@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid) void free_vars(TokenStream &in, bool &valid) { CHECK(expect_l_list(in, valid), valid); - while(!accept_r_list(in)) + while (!accept_r_list(in)) { CHECK(text_val(in, valid), valid); } @@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid) void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid) { CHECK(expect_text(in, "", valid), valid); - while(!accept_text(in, "")) + while (!accept_text(in, "")) { CHECK(heuristics_table_entry(in, h, valid), valid); } @@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid) const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val); const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val); - if(l_t != ComparatorType::Var || r_t != ComparatorType::Num) + if (l_t != ComparatorType::Var || r_t != ComparatorType::Num) { - FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); + FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, + "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); } - return Condition{ l_v, c_o, r_v }; + return Condition{l_v, c_o, r_v}; } void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) @@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) CHECK(expect_text(in, "", valid), valid); - HeuristicTree *t = nullptr; - std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); + HeuristicTree *t = nullptr; + std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); const HeuristicType t_heuristic_type = std::get<0>(t->index()); - while(!accept_text(in, "")) + while (!accept_text(in, "")) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "b")) + if (accept_text(in, "b")) { // Branch node const auto id = CHECK(uint_val(in, valid), valid); @@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) const auto f_id = CHECK(uint_val(in, valid), valid); valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid); } - else if(accept_text(in, "l")) + else if (accept_text(in, "l")) { // Leaf node const auto id = CHECK(uint_val(in, valid), valid); @@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) // heuristic table). For now it remains as a step for validation. 
LOG_TOKEN_POS(in, pos); CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid); - switch(t_heuristic_type) + switch (t_heuristic_type) { case HeuristicType::GEMM_Type: { @@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid) MLGOHeuristics h; CHECK_DEFAULT(header(in, valid), valid, h); CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h); - while(accept_text(in, " parse_mlgo(std::istream &in) #undef CHECK #undef CHECK_DEFAULT #undef FAIL_WITH_MSG -#undef FAIL_WITH_MSG_DEFAULT \ No newline at end of file +#undef FAIL_WITH_MSG_DEFAULT diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h index 49d8b9c644..cffce8d6a1 100644 --- a/src/runtime/CL/mlgo/MLGOParser.h +++ b/src/runtime/CL/mlgo/MLGOParser.h @@ -98,15 +98,14 @@ struct CharPosition return ln == other.ln && col == other.col; } - size_t ln{ 0 }; - size_t col{ 0 }; + size_t ln{0}; + size_t col{0}; }; /** Token */ struct Token { - Token(TokenType t, std::string v, CharPosition pos) - : type{ t }, value{ v }, pos{ pos } + Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos} { } @@ -196,4 +195,4 @@ std::pair parse_mlgo(std::istream &in); } // namespace parser } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp index 81d418c28e..c7e0100b3c 100644 --- a/src/runtime/CL/mlgo/Utils.cpp +++ b/src/runtime/CL/mlgo/Utils.cpp @@ -43,40 +43,38 @@ inline std::string to_str(const T &val) std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config) { return os << "Native:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config) { return os << "ReshapedOnlyRHS:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config) { return os << "Reshaped:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "v0: " << config.v0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_lhs: " << config.interleave_lhs << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "v0: " << config.v0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_lhs: " << config.interleave_lhs << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << 
"}"; } std::ostream &operator<<(std::ostream &os, HeuristicType ht) { - switch(ht) + switch (ht) { case HeuristicType::GEMM_Type: { @@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht) } std::ostream &operator<<(std::ostream &os, DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: { @@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos) } // namespace mlgo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h index c634a887e9..73b537f476 100644 --- a/src/runtime/CL/mlgo/Utils.h +++ b/src/runtime/CL/mlgo/Utils.h @@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht); std::ostream &operator<<(std::ostream &os, DataType dt); std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index); std::ostream &operator<<(std::ostream &os, const Query &query); -std::string to_string(const GEMMConfigNative &config); -std::string to_string(const GEMMConfigReshapedOnlyRHS &config); -std::string to_string(const GEMMConfigReshaped &config); -std::string to_string(const Query &query); +std::string to_string(const GEMMConfigNative &config); +std::string to_string(const GEMMConfigReshapedOnlyRHS &config); +std::string to_string(const GEMMConfigReshaped &config); +std::string to_string(const Query &query); namespace parser { std::ostream &operator<<(std::ostream &os, const CharPosition &pos); @@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos); } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_UTILS_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_UTILS_H diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 6f3e32491a..5e3907f1ea 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -27,20 +27,20 @@ namespace arm_compute { namespace cl_tuner { -constexpr unsigned int max_lws_supported_x{ 64u }; -constexpr unsigned int max_lws_supported_y{ 32u }; -constexpr unsigned int max_lws_supported_z{ 32u }; +constexpr unsigned int max_lws_supported_x{64u}; +constexpr unsigned int max_lws_supported_y{32u}; +constexpr unsigned int max_lws_supported_z{32u}; /** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */ class CLTuningParametersList : public ICLTuningParametersList { protected: /* Shape of 4-D search space */ - TensorShape search_space_shape{ 0, 0, 0, 0 }; - std::vector _lws_x{ 0 }; - std::vector _lws_y{ 0 }; - std::vector _lws_z{ 0 }; - std::vector _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units. + TensorShape search_space_shape{0, 0, 0, 0}; + std::vector _lws_x{0}; + std::vector _lws_y{0}; + std::vector _lws_z{0}; + std::vector _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units. The value is in the range [-31,+31]. When 0, the runtime-selected wbs used is unmodified. */ @@ -116,7 +116,8 @@ private: * @param[in] lws_max Max LWS value allowed to be tested * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one. 
*/ - void initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); + void + initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); }; /** A minimal subset of LWS values that only have 1,2 and 4/8 */ @@ -170,9 +171,9 @@ CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDR search_space_shape[1] = lws_y_max; search_space_shape[2] = lws_z_max; search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -3, -2, -1, 0, 1, 2, 3 }; + _wbsm = {-3, -2, -1, 0, 1, 2, 3}; search_space_shape[3] = _wbsm.size(); } } @@ -194,26 +195,31 @@ CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gw _lws_x = {}; _lws_y = {}; _lws_z = {}; - initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_x, gws[0], lws_x_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_y, gws[1], lws_y_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_z, gws[2], lws_z_max, false); search_space_shape[0] = _lws_x.size(); search_space_shape[1] = _lws_y.size(); search_space_shape[2] = _lws_z.size(); search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -2, -1, 0, 1, 2 }; + _wbsm = {-2, -1, 0, 1, 2}; search_space_shape[3] = _wbsm.size(); } } -void CLTuningParametersListNormal::initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) +void CLTuningParametersListNormal::initialize_lws_values(std::vector &lws, + unsigned int gws, + unsigned int lws_max, + bool mod_let_one) { lws.push_back(1); - for(unsigned int i = 2; i <= lws_max; ++i) + for (unsigned int i = 2; i <= lws_max; ++i) { // Power of two condition const bool is_power_of_two = (i & (i - 1)) == 0; @@ -221,7 +227,7 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws) { - switch(tuning_info.tuner_mode) + switch (tuning_info.tuner_mode) { case CLTunerMode::EXHAUSTIVE: return std::make_unique(gws, tuning_info); diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index 45e872428f..9fbdc3a4dd 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/Utility.h" + #include "support/Mutex.h" #include @@ -53,8 +54,7 @@ public: * @param[in] start First value that will be returned by the feeder * @param[in] end End condition (The last value returned by get_next() will be end - 1) */ - explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) - : _atomic_counter(start), _end(end) + explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end) { } /** Return the next element in the range if there is one. 
@@ -89,8 +89,7 @@ void process_workloads(std::vector &workloads, ThreadFeede { ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size()); workloads[workload_index](info); - } - while(feeder.get_next(workload_index)); + } while (feeder.get_next(workload_index)); } /** Set thread affinity. Pin current thread to a particular core @@ -99,7 +98,7 @@ void process_workloads(std::vector &workloads, ThreadFeede */ void set_thread_affinity(int core_id) { - if(core_id < 0) + if (core_id < 0) { return; } @@ -150,10 +149,10 @@ public: */ explicit Thread(int core_pin = -1); - Thread(const Thread &) = delete; + Thread(const Thread &) = delete; Thread &operator=(const Thread &) = delete; Thread(Thread &&) = delete; - Thread &operator=(Thread &&) = delete; + Thread &operator=(Thread &&) = delete; /** Destructor. Make the thread join. */ ~Thread(); @@ -196,21 +195,20 @@ public: private: std::thread _thread{}; ThreadInfo _info{}; - std::vector *_workloads{ nullptr }; - ThreadFeeder *_feeder{ nullptr }; + std::vector *_workloads{nullptr}; + ThreadFeeder *_feeder{nullptr}; std::mutex _m{}; std::condition_variable _cv{}; - bool _wait_for_work{ false }; - bool _job_complete{ true }; - std::exception_ptr _current_exception{ nullptr }; - int _core_pin{ -1 }; - std::list *_thread_pool{ nullptr }; - unsigned int _wake_beg{ 0 }; - unsigned int _wake_end{ 0 }; + bool _wait_for_work{false}; + bool _job_complete{true}; + std::exception_ptr _current_exception{nullptr}; + int _core_pin{-1}; + std::list *_thread_pool{nullptr}; + unsigned int _wake_beg{0}; + unsigned int _wake_end{0}; }; -Thread::Thread(int core_pin) - : _core_pin(core_pin) +Thread::Thread(int core_pin) : _core_pin(core_pin) { _thread = std::thread(&Thread::worker_thread, this); } @@ -218,7 +216,7 @@ Thread::Thread(int core_pin) Thread::~Thread() { // Make sure worker thread has ended - if(_thread.joinable()) + if (_thread.joinable()) { ThreadFeeder feeder; set_workload(nullptr, feeder, ThreadInfo()); @@ -257,7 +255,7 @@ void Thread::worker_thread() { set_thread_affinity(_core_pin); - while(true) + while (true) { std::unique_lock lock(_m); _cv.wait(lock, [&] { return _wait_for_work; }); @@ -266,18 +264,18 @@ void Thread::worker_thread() _current_exception = nullptr; // Exit if the worker thread has not been fed with workloads - if(_workloads == nullptr || _feeder == nullptr) + if (_workloads == nullptr || _feeder == nullptr) { return; } // Wake up more peer threads from thread pool if this job has been delegated to the current thread - if(_thread_pool != nullptr) + if (_thread_pool != nullptr) { auto thread_it = _thread_pool->begin(); std::advance(thread_it, std::min(static_cast(_thread_pool->size()), _wake_beg)); auto wake_end = std::min(_wake_end, static_cast(_info.num_threads - 1)); - for(unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) + for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) { thread_it->start(); } @@ -291,7 +289,7 @@ void Thread::worker_thread() #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(...) + catch (...) 
{ _current_exception = std::current_exception(); } @@ -322,11 +320,11 @@ struct CPPScheduler::Impl final : _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U) { const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE")); - if(mode_env_v == "linear") + if (mode_env_v == "linear") { _forced_mode = ModeToggle::Linear; } - else if(mode_env_v == "fanout") + else if (mode_env_v == "fanout") { _forced_mode = ModeToggle::Fanout; } @@ -350,7 +348,7 @@ struct CPPScheduler::Impl final // Set affinity on worked threads _threads.clear(); - for(auto i = 1U; i < _num_threads; ++i) + for (auto i = 1U; i < _num_threads; ++i) { _threads.emplace_back(func(i, thread_hint)); } @@ -359,20 +357,23 @@ struct CPPScheduler::Impl final void auto_switch_mode(unsigned int num_threads_to_use) { // If the environment variable is set to any of the modes, it overwrites the mode selected over num_threads_to_use - if(_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) + if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) { set_fanout_mode(m_default_wake_fanout, num_threads_to_use); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", this->wake_fanout(), num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", + this->wake_fanout(), num_threads_to_use); } else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8)) { set_linear_mode(); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", + num_threads_to_use); } } void set_linear_mode() { - for(auto &thread : _threads) + for (auto &thread : _threads) { thread.set_linear_mode(); } @@ -384,14 +385,14 @@ struct CPPScheduler::Impl final ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1); const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1)); auto thread_it = _threads.begin(); - for(auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) + for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) { const auto wake_begin = i * actual_wake_fanout - 1; const auto wake_end = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1); thread_it->set_fanout_mode(&_threads, wake_begin, wake_end); } // Reset the remaining threads's wake up schedule - while(thread_it != _threads.end()) + while (thread_it != _threads.end()) { thread_it->set_fanout_mode(&_threads, 0U, 0U); ++thread_it; @@ -417,9 +418,9 @@ struct CPPScheduler::Impl final unsigned int _num_threads; std::list _threads; arm_compute::Mutex _run_workloads_mutex{}; - Mode _mode{ Mode::Linear }; - ModeToggle _forced_mode{ ModeToggle::None }; - unsigned int _wake_fanout{ 0 }; + Mode _mode{Mode::Linear}; + ModeToggle _forced_mode{ModeToggle::None}; + unsigned int _wake_fanout{0}; }; /* @@ -431,8 +432,7 @@ CPPScheduler &CPPScheduler::get() return scheduler; } -CPPScheduler::CPPScheduler() - : _impl(std::make_unique(num_threads_hint())) +CPPScheduler::CPPScheduler() : _impl(std::make_unique(num_threads_hint())) { } @@ -465,15 +465,15 @@ void CPPScheduler::run_workloads(std::vector &workloads) // This 
is not great because different threads workloads won't run in parallel but at least they // won't interfere each other and deadlock. arm_compute::lock_guard lock(_impl->_run_workloads_mutex); - const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast(workloads.size())); - if(num_threads_to_use < 1) + const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast(workloads.size())); + if (num_threads_to_use < 1) { return; } // Re-adjust the mode if the actual number of threads to use is different from the number of threads created _impl->auto_switch_mode(num_threads_to_use); int num_threads_to_start = 0; - switch(_impl->mode()) + switch (_impl->mode()) { case CPPScheduler::Impl::Mode::Fanout: { @@ -494,22 +494,22 @@ void CPPScheduler::run_workloads(std::vector &workloads) unsigned int t = 0; auto thread_it = _impl->_threads.begin(); // Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread - for(; t < num_threads_to_use - 1; ++t, ++thread_it) + for (; t < num_threads_to_use - 1; ++t, ++thread_it) { info.thread_id = t; thread_it->set_workload(&workloads, feeder, info); } thread_it = _impl->_threads.begin(); - for(int i = 0; i < num_threads_to_start; ++i, ++thread_it) + for (int i = 0; i < num_threads_to_start; ++i, ++thread_it) { thread_it->start(); } - info.thread_id = t; // Set main thread's thread_id + info.thread_id = t; // Set main thread's thread_id std::exception_ptr last_exception = nullptr; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED try { -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ process_workloads(workloads, feeder, info); // Main thread processes workloads #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } @@ -522,7 +522,7 @@ void CPPScheduler::run_workloads(std::vector &workloads) { #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ thread_it = _impl->_threads.begin(); - for(unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) + for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) { std::exception_ptr current_exception = thread_it->wait(); if (current_exception) @@ -536,7 +536,7 @@ void CPPScheduler::run_workloads(std::vector &workloads) } #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::system_error &e) + catch (const std::system_error &e) { std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; } diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp index 5890553f6f..c46a2731d8 100644 --- a/src/runtime/CPP/SingleThreadScheduler.cpp +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -39,10 +39,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { const Window &max_window = kernel->window(); - if(hints.split_dimension() != IScheduler::split_dimensions_all) + if (hints.split_dimension() != IScheduler::split_dimensions_all) { const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - if(num_iterations < 1) + if (num_iterations < 1) { return; } @@ -53,7 +53,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) kernel->run(kernel->window(), info); } -void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, + const Hints &hints, + const Window &window, + ITensorPack &tensors) { ARM_COMPUTE_UNUSED(hints); ThreadInfo info; @@ 
-65,7 +68,7 @@ void SingleThreadScheduler::run_workloads(std::vector &workloads) { ThreadInfo info; info.cpu_info = &cpu_info(); - for(auto &wl : workloads) + for (auto &wl : workloads) { wl(info); } diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp index dccbe4045d..94a1673d59 100644 --- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp +++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp @@ -42,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -80,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm8(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm16(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; default: 
ARM_COMPUTE_ERROR("Unsupported data type"); @@ -132,14 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh { } -void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, - ITensor *scores_out, ITensor *boxes_out, ITensor *classes, ITensor *batch_splits_out, - ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, + keeps, keeps_size, info); - _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; + _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || + scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; _scores_in = scores_in; _boxes_in = boxes_in; @@ -150,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _batch_splits_out = batch_splits_out; _keeps = keeps; - if(_is_qasymm8) + if (_is_qasymm8) { // Manage intermediate buffers _memory_group.manage(&_scores_in_f32); @@ -160,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _memory_group.manage(&_classes_f32); _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32)); _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { _memory_group.manage(&_batch_splits_in_f32); _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32)); @@ -168,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32)); _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32)); _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _memory_group.manage(&_batch_splits_out_f32); _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32)); } - if(keeps != nullptr) + if (keeps != nullptr) { _memory_group.manage(&_keeps_f32); _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32)); } - _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, + _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, + (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, &_scores_out_f32, &_boxes_out_f32, &_classes_f32, - (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr, - keeps_size, info); + (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, + (keeps != nullptr) ? 
&_keeps_f32 : nullptr, keeps_size, info); } else { - _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, + batch_splits_out, keeps, keeps_size, info); } - if(_is_qasymm8) + if (_is_qasymm8) { _scores_in_f32.allocator()->allocate(); _boxes_in_f32.allocator()->allocate(); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { _batch_splits_in_f32.allocator()->allocate(); } _scores_out_f32.allocator()->allocate(); _boxes_out_f32.allocator()->allocate(); _classes_f32.allocator()->allocate(); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _batch_splits_out_f32.allocator()->allocate(); } - if(keeps != nullptr) + if (keeps != nullptr) { _keeps_f32.allocator()->allocate(); } } } -Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, - const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info) +Status validate(const ITensorInfo *scores_in, + const ITensorInfo *boxes_in, + const ITensorInfo *batch_splits_in, + const ITensorInfo *scores_out, + const ITensorInfo *boxes_out, + const ITensorInfo *classes, + const ITensorInfo *batch_splits_out, + const ITensorInfo *keeps, + const ITensorInfo *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); - const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; - if(is_qasymm8) + const bool is_qasymm8 = + scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out); @@ -237,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_qasymm8) + if (_is_qasymm8) { dequantize_tensor(_scores_in, &_scores_in_f32); dequantize_tensor(_boxes_in, &_boxes_in_f32); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32); } @@ -249,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY); - if(_is_qasymm8) + if (_is_qasymm8) { quantize_tensor(&_scores_out_f32, _scores_out); quantize_tensor(&_boxes_out_f32, _boxes_out); quantize_tensor(&_classes_f32, _classes); - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { quantize_tensor(&_batch_splits_out_f32, _batch_splits_out); } - if(_keeps != nullptr) + if (_keeps != nullptr) { quantize_tensor(&_keeps_f32, _keeps); } diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp 
b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp index 41d875eb97..e6291f973e 100644 --- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include @@ -36,25 +36,35 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status validate_arguments(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N]."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The location input tensor should be [C2, N]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, + "The priorbox input tensor should be [C3, 2, N]."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f && info.eta() > 1.f, "Eta should be between 0 and 1"); const int num_priors = input_priorbox->tensor_shape()[0] / 4; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_loc_classes() * 4)) != + input_loc->tensor_shape()[0], + "Number of priors must match number of location predictions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_classes())) != + input_conf->tensor_shape()[0], + "Number of priors must match number of confidence predictions."); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output); } @@ -65,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input /** Function used to sort pair in descend order based on the score (first) value. 
*/ template -bool SortScorePairDescend(const std::pair &pair1, - const std::pair &pair2) +bool SortScorePairDescend(const std::pair &pair1, const std::pair &pair2) { return pair1.first > pair2.first; } @@ -82,16 +91,19 @@ bool SortScorePairDescend(const std::pair &pair1, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, - const int num_priors, const int num_loc_classes, - const bool share_location, std::vector &all_location_predictions) +void retrieve_all_loc_predictions(const ITensor *input_loc, + const int num, + const int num_priors, + const int num_loc_classes, + const bool share_location, + std::vector &all_location_predictions) { - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { int label = share_location ? -1 : c; - if(all_location_predictions[i].find(label) == all_location_predictions[i].end()) + if (all_location_predictions[i].find(label) == all_location_predictions[i].end()) { all_location_predictions[i][label].resize(num_priors); } @@ -102,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { const int label = share_location ? -1 : c; const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4; //xmin, ymin, xmax, ymax - all_location_predictions[i][label][p][0] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr))); - all_location_predictions[i][label][p][1] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); - all_location_predictions[i][label][p][2] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); - all_location_predictions[i][label][p][3] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); + all_location_predictions[i][label][p][0] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr))); + all_location_predictions[i][label][p][1] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); + all_location_predictions[i][label][p][2] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); + all_location_predictions[i][label][p][3] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); } } } @@ -130,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, * @param[out] all_location_predictions All the location predictions. 
* */ -void retrieve_all_conf_scores(const ITensor *input_conf, const int num, - const int num_priors, const int num_classes, +void retrieve_all_conf_scores(const ITensor *input_conf, + const int num, + const int num_priors, + const int num_classes, std::vector>> &all_confidence_scores) { std::vector tmp_buffer; tmp_buffer.resize(num * num_priors * num_classes); - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = - *reinterpret_cast(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); + tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast( + input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { all_confidence_scores[i][c].resize(num_priors); all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors], @@ -168,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_priorbox(const ITensor *input_priorbox, - const int num_priors, - std::vector &all_prior_bboxes, +void retrieve_all_priorbox(const ITensor *input_priorbox, + const int num_priors, + std::vector &all_prior_bboxes, std::vector> &all_prior_variances) { - for(int i = 0; i < num_priors; ++i) + for (int i = 0; i < num_priors; ++i) { - all_prior_bboxes[i] = - { - { - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3))) - } - }; + all_prior_bboxes[i] = {{*reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}}; } - std::array var({ { 0, 0, 0, 0 } }); - for(int i = 0; i < num_priors; ++i) + std::array var({{0, 0, 0, 0}}); + for (int i = 0; i < num_priors; ++i) { - for(int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { var[j] = *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j))); } @@ -208,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox, * @param[out] decode_bbox The decoded bboxes. * */ -void DecodeBBox(const BBox &prior_bbox, const std::array &prior_variance, - const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target, - const bool clip_bbox, const BBox &bbox, BBox &decode_bbox) +void DecodeBBox(const BBox &prior_bbox, + const std::array &prior_variance, + const DetectionOutputLayerCodeType code_type, + const bool variance_encoded_in_target, + const bool clip_bbox, + const BBox &bbox, + BBox &decode_bbox) { // if the variance is encoded in target, we simply need to add the offset predictions // otherwise we need to scale the offset accordingly. 
- switch(code_type) + switch (code_type) { case DetectionOutputLayerCodeType::CORNER: { @@ -237,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.; const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.; - const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; - const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; - const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; - const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; + const float decode_bbox_center_x = + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; + const float decode_bbox_center_y = + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; + const float decode_bbox_width = + (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; + const float decode_bbox_height = + (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f); decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f); @@ -258,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian ARM_COMPUTE_ERROR_ON(prior_width <= 0.f); ARM_COMPUTE_ERROR_ON(prior_height <= 0.f); - decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; - decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; - decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; - decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; + decode_bbox[0] = + prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; + decode_bbox[1] = + prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; + decode_bbox[2] = + prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; + decode_bbox[3] = + prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; break; } @@ -269,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type."); } - if(clip_bbox) + if (clip_bbox) { - for(auto &d_bbox : decode_bbox) + for (auto &d_bbox : decode_bbox) { d_bbox = utility::clamp(d_bbox, 0.f, 1.f); } @@ -289,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian * @param[out] indices The kept indices of bboxes after nms. 
* */ -void ApplyNMSFast(const std::vector &bboxes, - const std::vector &scores, const float score_threshold, - const float nms_threshold, const float eta, const int top_k, - std::vector &indices) +void ApplyNMSFast(const std::vector &bboxes, + const std::vector &scores, + const float score_threshold, + const float nms_threshold, + const float eta, + const int top_k, + std::vector &indices) { ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size."); @@ -300,9 +328,9 @@ void ApplyNMSFast(const std::vector &bboxes, std::list> score_index_vec; // Generate index score pairs. - for(size_t i = 0; i < scores.size(); ++i) + for (size_t i = 0; i < scores.size(); ++i) { - if(scores[i] > score_threshold) + if (scores[i] > score_threshold) { score_index_vec.emplace_back(std::make_pair(scores[i], i)); } @@ -313,7 +341,7 @@ void ApplyNMSFast(const std::vector &bboxes, // Keep top_k scores if needed. const int score_index_vec_size = score_index_vec.size(); - if(top_k > -1 && top_k < score_index_vec_size) + if (top_k > -1 && top_k < score_index_vec_size) { score_index_vec.resize(top_k); } @@ -322,46 +350,45 @@ void ApplyNMSFast(const std::vector &bboxes, float adaptive_threshold = nms_threshold; indices.clear(); - while(!score_index_vec.empty()) + while (!score_index_vec.empty()) { const int idx = score_index_vec.front().second; bool keep = true; - for(int kept_idx : indices) + for (int kept_idx : indices) { - if(keep) + if (keep) { // Compute the jaccard (intersection over union IoU) overlap between two bboxes. - BBox intersect_bbox = std::array({ 0, 0, 0, 0 }); - if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) + BBox intersect_bbox = std::array({0, 0, 0, 0}); + if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || + bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) { - intersect_bbox = std::array({ { 0, 0, 0, 0 } }); + intersect_bbox = std::array({{0, 0, 0, 0}}); } else { - intersect_bbox = std::array({ { - std::max(bboxes[idx][0], bboxes[kept_idx][0]), - std::max(bboxes[idx][1], bboxes[kept_idx][1]), - std::min(bboxes[idx][2], bboxes[kept_idx][2]), - std::min(bboxes[idx][3], bboxes[kept_idx][3]) - } - }); + intersect_bbox = std::array( + {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]), + std::min(bboxes[idx][2], bboxes[kept_idx][2]), + std::min(bboxes[idx][3], bboxes[kept_idx][3])}}); } float intersect_width = intersect_bbox[2] - intersect_bbox[0]; float intersect_height = intersect_bbox[3] - intersect_bbox[1]; float overlap = 0.f; - if(intersect_width > 0 && intersect_height > 0) + if (intersect_width > 0 && intersect_height > 0) { float intersect_size = intersect_width * intersect_height; - float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] - || bboxes[idx][3] < bboxes[idx][1]) ? - 0.f : - (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); - float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0] - || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ? - 0.f : - (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); + float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1]) + ? 
0.f + : (bboxes[idx][2] - bboxes[idx][0]) * + (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); + float bbox2_size = + (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1]) + ? 0.f + : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * + (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size); } keep = (overlap <= adaptive_threshold); @@ -371,12 +398,12 @@ void ApplyNMSFast(const std::vector &bboxes, break; } } - if(keep) + if (keep) { indices.push_back(idx); } score_index_vec.erase(score_index_vec.begin()); - if(keep && eta < 1.f && adaptive_threshold > 0.5f) + if (keep && eta < 1.f && adaptive_threshold > 0.5f) { adaptive_threshold *= eta; } @@ -385,13 +412,27 @@ void ApplyNMSFast(const std::vector &bboxes, } // namespace CPPDetectionOutputLayer::CPPDetectionOutputLayer() - : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(), - _all_prior_variances(), _all_decode_bboxes(), _all_indices() + : _input_loc(nullptr), + _input_conf(nullptr), + _input_priorbox(nullptr), + _output(nullptr), + _info(), + _num_priors(), + _num(), + _all_location_predictions(), + _all_confidence_scores(), + _all_prior_bboxes(), + _all_prior_variances(), + _all_decode_bboxes(), + _all_indices() { } -void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, - ITensor *output, DetectionOutputLayerInfo info) +void CPPDetectionOutputLayer::configure(const ITensor *input_loc, + const ITensor *input_conf, + const ITensor *input_priorbox, + ITensor *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info); @@ -400,11 +441,13 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor // Since the number of bboxes to kept is unknown before nms, the shape is set to the maximum // The maximum is keep_top_k * input_loc_size[1] // Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax] - const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); _input_loc = input_loc; _input_conf = input_conf; @@ -420,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor _all_prior_variances.resize(_num_priors); _all_decode_bboxes.resize(_num); - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? 
-1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; @@ -440,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); } -Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info)); return Status{}; @@ -449,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe void CPPDetectionOutputLayer::run() { // Retrieve all location predictions. - retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions); + retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), + _all_location_predictions); // Retrieve all confidences. retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores); @@ -459,75 +507,79 @@ void CPPDetectionOutputLayer::run() // Decode all loc predictions to bboxes const bool clip_bbox = false; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; } - ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label); + ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), + "Could not find location predictions for label %d.", label); const std::vector &label_loc_preds = _all_location_predictions[i].find(label)->second; const int num_bboxes = _all_prior_bboxes.size(); ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4); - for(int j = 0; j < num_bboxes; ++j) + for (int j = 0; j < num_bboxes; ++j) { - DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]); + DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), + _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], + _all_decode_bboxes[i][label][j]); } } } int num_kept = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - const std::map> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + const std::map> &conf_scores = _all_confidence_scores[i]; std::map> indices; - int num_det = 0; - for(int c = 0; c < _info.num_classes(); ++c) + int num_det = 0; + for (int c = 0; c < _info.num_classes(); ++c) { - if(c == _info.background_label_id()) + if (c == _info.background_label_id()) { // Ignore background class continue; } const int label = _info.share_location() ? 
-1 : c; - if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) + if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector &scores = conf_scores.find(c)->second; - const std::vector &bboxes = decode_bboxes.find(label)->second; + const std::vector &bboxes = decode_bboxes.find(label)->second; - ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]); + ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), + _info.top_k(), indices[c]); num_det += indices[c].size(); } int num_to_add = 0; - if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) + if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) { std::vector>> score_index_pairs; - for(auto const &it : indices) + for (auto const &it : indices) { const int label = it.first; const std::vector &label_indices = it.second; - if(conf_scores.find(label) == conf_scores.end()) + if (conf_scores.find(label) == conf_scores.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector &scores = conf_scores.find(label)->second; - for(auto idx : label_indices) + for (auto idx : label_indices) { ARM_COMPUTE_ERROR_ON(idx > static_cast(scores.size())); score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx))); @@ -541,7 +593,7 @@ void CPPDetectionOutputLayer::run() // Store the new indices. std::map> new_indices; - for(auto score_index_pair : score_index_pairs) + for (auto score_index_pair : score_index_pairs) { int label = score_index_pair.second.first; int idx = score_index_pair.second.second; @@ -562,25 +614,25 @@ void CPPDetectionOutputLayer::run() _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept))); int count = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const std::map> &conf_scores = _all_confidence_scores[i]; - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - for(auto &it : _all_indices[i]) + const std::map> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + for (auto &it : _all_indices[i]) { const int label = it.first; const std::vector &scores = conf_scores.find(label)->second; const int loc_label = _info.share_location() ? -1 : label; - if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) + if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) { // Either if there are no confidence predictions // or there are no location predictions for current label. 
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label); } const std::vector &bboxes = decode_bboxes.find(loc_label)->second; - const std::vector &indices = it.second; + const std::vector &indices = it.second; - for(auto idx : indices) + for (auto idx : indices) { *(reinterpret_cast(_output->ptr_to_element(Coordinates(count * 7)))) = i; *(reinterpret_cast(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label; diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index ecbc49b3c1..2861d6cacb 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include #include @@ -38,53 +38,76 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, - DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox) +Status validate_arguments(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info, + const unsigned int kBatchSize, + const unsigned int kNumCoordBox) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize]."); - if(input_box_encoding->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, + "The location input tensor shape should be [4, N, kBatchSize]."); + if (input_box_encoding->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR( + input_box_encoding->dimension(2) != kBatchSize, + "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1), - "The first dimension of the input class_prediction should be equal to the number of classes plus one."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize]."); - 
if(input_anchors->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, + "The first dimension of the input box_encoding tensor should be equal to %d.", + kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_class_score->dimension(0) != (info.num_classes() + 1), + "The first dimension of the input class_prediction should be equal to the number of classes plus one."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, + "The anchors input tensor shape should be [4, N, kBatchSize]."); + if (input_anchors->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, + "The first dimension of the input anchors tensor should be equal to %d.", + kNumCoordBox); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) - || (input_box_encoding->dimension(1) != input_anchors->dimension(1)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) || + (input_box_encoding->dimension(1) != input_anchors->dimension(1)), "The second dimension of the inputs should be the same."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, + "The num_detection output tensor shape should be [M]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), + "The intersection over union should be positive and less than 1."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, + "The number of max classes per detection should be positive."); const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection(); // Validate configured outputs - if(output_boxes->total_size() != 0) + if (output_boxes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), + TensorShape(4U, num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32); } - if(output_classes->total_size() != 0) + if (output_classes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32); } - if(output_scores->total_size() != 0) + if (output_scores->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32); } - if(num_detection->total_size() != 0) + if (num_detection->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32); @@ -93,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn return Status{}; } -inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info) +inline void +DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info) { const float half_factor = 0.5f; // BBox is equavalent to CenterSizeEncoding [y,x,h,w] const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0]; const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1]; - const float half_h = half_factor * static_cast(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2]; - const float half_w = half_factor * static_cast(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3]; + const float half_h = + half_factor * static_cast(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2]; + const float half_w = + half_factor * static_cast(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3]; // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax] auto decoded_ptr = reinterpret_cast(decoded_it.ptr()); @@ -118,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode * @param[in] info The detection informations * @param[out] decoded_boxes The decoded bboxes. */ -void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes) +void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, + const ITensor *input_anchors, + DetectionPostProcessLayerInfo info, + Tensor *decoded_boxes) { const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info(); const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info(); - BBox box_centersize{ {} }; - BBox anchor{ {} }; + BBox box_centersize{{}}; + BBox anchor{{}}; Window win; win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape()); @@ -133,107 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp Iterator anchor_it(input_anchors, win); Iterator decoded_it(decoded_boxes, win); - if(input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (input_box_encoding->info()->data_type() == DataType::QASYMM8) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), - dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = 
reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = + BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), + dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors), + dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } - else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), - dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box), + dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), + dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), + dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), + dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) }); - anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)}); + anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } } -void SaveOutputs(const Tensor *decoded_boxes, const std::vector &result_idx_boxes_after_nms, const std::vector &result_scores_after_nms, const std::vector &result_classes_after_nms, - std::vector &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor 
*output_scores, - ITensor *num_detection) +void SaveOutputs(const Tensor *decoded_boxes, + const std::vector &result_idx_boxes_after_nms, + const std::vector &result_scores_after_nms, + const std::vector &result_classes_after_nms, + std::vector &sorted_indices, + const unsigned int num_output, + const unsigned int max_detections, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection) { // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax unsigned int i = 0; - for(; i < num_output; ++i) + for (; i < num_output; ++i) { const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]]; - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); - *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = static_cast(result_classes_after_nms[sorted_indices[i]]); - *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]]; + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); + *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = + static_cast(result_classes_after_nms[sorted_indices[i]]); + *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = + result_scores_after_nms[sorted_indices[i]]; } - for(; i < max_detections; ++i) + for (; i < max_detections; ++i) { *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f; - *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; - *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; } *(reinterpret_cast(num_detection->ptr_to_element(Coordinates(0)))) = num_output; } } // namespace CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr), - _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), 
_num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(), - _selected_indices(), _class_scores(), _input_scores_to_use(nullptr) + : _memory_group(std::move(memory_manager)), + _nms(), + _input_box_encoding(nullptr), + _input_scores(nullptr), + _input_anchors(nullptr), + _output_boxes(nullptr), + _output_classes(nullptr), + _output_scores(nullptr), + _num_detection(nullptr), + _info(), + _num_boxes(), + _num_classes_with_background(), + _num_max_detected_boxes(), + _dequantize_scores(false), + _decoded_boxes(), + _decoded_scores(), + _selected_indices(), + _class_scores(), + _input_scores_to_use(nullptr) { } -void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, - const ITensor *input_anchors, ITensor *output_boxes, ITensor *output_classes, - ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info); _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection(); - auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_boxes->info(), + TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_classes->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_scores->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(), - num_detection->info(), - info, _kBatchSize, _kNumCoordBox)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox)); _input_box_encoding = input_box_encoding; _input_scores = input_scores; @@ -245,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _info = info; _num_boxes = input_box_encoding->info()->dimension(1); _num_classes_with_background = _input_scores->info()->dimension(0); - _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); - - 
auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32)); + _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); + + auto_init_if_empty(*_decoded_boxes.info(), + TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, + DataType::F32)); + auto_init_if_empty( + *_decoded_scores.info(), + TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), + 1, DataType::F32)); + auto_init_if_empty( + *_selected_indices.info(), + TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, + DataType::S32)); const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes()); - auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32)); + auto_init_if_empty( + *_class_scores.info(), + TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, + DataType::F32)); _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores; @@ -260,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _memory_group.manage(&_decoded_scores); _memory_group.manage(&_selected_indices); _memory_group.manage(&_class_scores); - _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold()); + _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, + info.use_regular_nms() ? 
info.detection_per_class() : info.max_detections(), + info.nms_score_threshold(), info.iou_threshold()); // Allocate and reserve intermediate tensors and vectors _decoded_boxes.allocator()->allocate(); @@ -269,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _class_scores.allocator()->allocate(); } -Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { - constexpr unsigned int kBatchSize = 1; - constexpr unsigned int kNumCoordBox = 4; - const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); - - ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(), - info.iou_threshold())); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox)); + constexpr unsigned int kBatchSize = 1; + constexpr unsigned int kNumCoordBox = 4; + const TensorInfo _decoded_boxes_info = + TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); + + ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, + &_selected_indices_info, info.max_detections(), + info.nms_score_threshold(), info.iou_threshold())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info, kBatchSize, + kNumCoordBox)); return Status{}; } @@ -293,62 +393,69 @@ void CPPDetectionPostProcessLayer::run() DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes); // Decode scores if necessary - if(_dequantize_scores) + if (_dequantize_scores) { - if(_input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (_input_box_encoding->info()->data_type() == DataType::QASYMM8) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8(*(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), 
_input_scores->info()->quantization_info()); + dequantize_qasymm8( + *(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } - else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8_signed(*(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8_signed(*(reinterpret_cast( + _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } } // Regular NMS - if(_info.use_regular_nms()) + if (_info.use_regular_nms()) { std::vector result_idx_boxes_after_nms; std::vector result_classes_after_nms; std::vector result_scores_after_nms; std::vector sorted_indices; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { // For each boxes get scores of the boxes for the class c - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(i)))) = - *(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 + *(reinterpret_cast(_input_scores_to_use->ptr_to_element( + Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 } // Run Non-maxima Suppression _nms.run(); - for(unsigned int i = 0; i < _info.detection_per_class(); ++i) + for (unsigned int i = 0; i < _info.detection_per_class(); ++i) { - const auto selected_index = *(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))); - if(selected_index == -1) + const auto selected_index = + *(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))); + if (selected_index == -1) { // Nms will return -1 for all the last M-elements not valid break; } result_idx_boxes_after_nms.emplace_back(selected_index); - result_scores_after_nms.emplace_back((reinterpret_cast(_class_scores.buffer()))[selected_index]); + result_scores_after_nms.emplace_back( + (reinterpret_cast(_class_scores.buffer()))[selected_index]); result_classes_after_nms.emplace_back(c); } } @@ -360,49 +467,46 @@ void CPPDetectionPostProcessLayer::run() // Sort selected indices based on result scores sorted_indices.resize(num_selected); std::iota(sorted_indices.begin(), sorted_indices.end(), 0); - std::partial_sort(sorted_indices.data(), - sorted_indices.data() + num_output, + std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output, sorted_indices.data() + num_selected, [&](unsigned int first, unsigned int second) - { - - return result_scores_after_nms[first] > result_scores_after_nms[second]; - }); + { return result_scores_after_nms[first] > result_scores_after_nms[second]; }); - SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, 
result_scores_after_nms, result_classes_after_nms, + sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, + _num_detection); } // Fast NMS else { - const unsigned int num_classes_per_box = std::min(_info.max_classes_per_detection(), _info.num_classes()); + const unsigned int num_classes_per_box = + std::min(_info.max_classes_per_detection(), _info.num_classes()); std::vector max_scores; std::vector box_indices; std::vector max_score_classes; - for(unsigned int b = 0; b < _num_boxes; ++b) + for (unsigned int b = 0; b < _num_boxes; ++b) { std::vector box_scores; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { - box_scores.emplace_back(*(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); + box_scores.emplace_back( + *(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); } std::vector max_score_indices; max_score_indices.resize(_info.num_classes()); std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0); - std::partial_sort(max_score_indices.data(), - max_score_indices.data() + num_classes_per_box, + std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box, max_score_indices.data() + num_classes, [&](unsigned int first, unsigned int second) - { - return box_scores[first] > box_scores[second]; - }); + { return box_scores[first] > box_scores[second]; }); - for(unsigned int i = 0; i < num_classes_per_box; ++i) + for (unsigned int i = 0; i < num_classes_per_box; ++i) { - const float score_to_add = box_scores[max_score_indices[i]]; - *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add; + const float score_to_add = box_scores[max_score_indices[i]]; + *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = + score_to_add; max_scores.emplace_back(score_to_add); box_indices.emplace_back(b); max_score_classes.emplace_back(max_score_indices[i]); @@ -412,10 +516,10 @@ void CPPDetectionPostProcessLayer::run() // Run Non-maxima Suppression _nms.run(); std::vector selected_indices; - for(unsigned int i = 0; i < max_detections; ++i) + for (unsigned int i = 0; i < max_detections; ++i) { // NMS returns M valid indices, the not valid tail is filled with -1 - if(*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) + if (*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) { // Nms will return -1 for all the last M-elements not valid break; @@ -425,8 +529,8 @@ void CPPDetectionPostProcessLayer::run() // We select the max detection numbers of the highest score of all classes const auto num_output = std::min(_info.max_detections(), selected_indices.size()); - SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output, + max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp index 6d01b127c0..3217742c6b 100644 --- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp +++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp @@ -29,9 +29,12 @@ namespace 
arm_compute
 {
-void CPPNonMaximumSuppression::configure(
-    const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
-    const float score_threshold, const float nms_threshold)
+void CPPNonMaximumSuppression::configure(const ITensor *bboxes,
+                                         const ITensor *scores,
+                                         ITensor *indices,
+                                         unsigned int max_output_size,
+                                         const float score_threshold,
+                                         const float nms_threshold)
 {
     ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
 
@@ -40,10 +43,14 @@ void CPPNonMaximumSuppression::configure(
     _kernel = std::move(k);
 }
 
-Status CPPNonMaximumSuppression::validate(
-    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
-    const float score_threshold, const float nms_threshold)
+Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes,
+                                          const ITensorInfo *scores,
+                                          const ITensorInfo *indices,
+                                          unsigned int max_output_size,
+                                          const float score_threshold,
+                                          const float nms_threshold)
 {
-    return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+    return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold,
+                                                    nms_threshold);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
index 62a74735a2..3d64def804 100644
--- a/src/runtime/CPP/functions/CPPTopKV.cpp
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -38,7 +38,10 @@ void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITe
     _kernel = std::move(kernel);
 }
 
-Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKV::validate(const ITensorInfo *predictions,
+                          const ITensorInfo *targets,
+                          ITensorInfo *output,
+                          const unsigned int k)
 {
     return CPPTopKVKernel::validate(predictions, targets, output, k);
 }
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 436fd9ca16..ecf84abd2c 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Log.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/common/cpuinfo/CpuInfo.h"
 #include "src/runtime/SchedulerUtils.h"
 
@@ -59,7 +60,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 #ifndef BARE_METAL
     const Window &max_window = window;
-    if(hints.split_dimension() == IScheduler::split_dimensions_all)
+    if (hints.split_dimension() == IScheduler::split_dimensions_all)
     {
         /*
          * if the split dim is size_t max then this signals we should parallelise over
@@ -73,27 +74,27 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
         std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n);
 
         std::vector workloads;
-        for(unsigned int ni = 0; ni != n_threads; ++ni)
+        for (unsigned int ni = 0; ni != n_threads; ++ni)
         {
-            for(unsigned int mi = 0; mi != m_threads; ++mi)
+            for (unsigned int mi = 0; mi != m_threads; ++mi)
             {
                 workloads.push_back(
-                    [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info)
-                {
-                    //narrow the window to our mi-ni workload
-                    Window win = max_window.split_window(Window::DimX, mi, m_threads)
-                                 .split_window(Window::DimY, ni, n_threads);
+                    [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info)
+                    {
+                        //narrow the window to our mi-ni workload
+                        Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                                         .split_window(Window::DimY, ni, n_threads);
 
-                    win.validate();
+                        win.validate();
 
-                    Window thread_locator;
-                    thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
-                    thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+                        Window thread_locator;
+                        thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+                        thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
 
-                    thread_locator.validate();
+                        thread_locator.validate();
 
-                    kernel->run_nd(win, info, thread_locator);
-                });
+                        kernel->run_nd(win, info, thread_locator);
+                    });
             }
         }
         run_workloads(workloads);
@@ -103,16 +104,16 @@
     const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
     const unsigned int num_threads = std::min(num_iterations, this->num_threads());
 
-    if(num_iterations == 0)
+    if (num_iterations == 0)
     {
         return;
     }
 
-    if(!kernel->is_parallelisable() || num_threads == 1)
+    if (!kernel->is_parallelisable() || num_threads == 1)
     {
         ThreadInfo info;
         info.cpu_info = &cpu_info();
-        if(tensors.empty())
+        if (tensors.empty())
         {
             kernel->run(max_window, info);
         }
@@ -124,14 +125,15 @@
     else
     {
         unsigned int num_windows = 0;
-        switch(hints.strategy())
+        switch (hints.strategy())
         {
             case StrategyHint::STATIC:
                 num_windows = num_threads;
                 break;
            case StrategyHint::DYNAMIC:
            {
-                const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold());
+                const unsigned int granule_threshold =
+                    (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold());
                 // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
                 num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
                 break;
@@ -143,15 +145,15 @@
         num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
 
         std::vector workloads(num_windows);
-        for(unsigned int t = 0; t < num_windows; ++t)
+        for (unsigned int t = 0; t < num_windows; ++t)
         {
             //Capture 't' by copy, all the other variables by reference:
-            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
             {
                 Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
                 win.validate();
 
-                if(tensors.empty())
+                if (tensors.empty())
                 {
                     kernel->run(win, info);
                 }
@@ -175,36 +177,43 @@ void IScheduler::run_tagged_workloads(std::vector &workloads, const ch
     run_workloads(workloads);
 }
 
-std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info)
+std::size_t IScheduler::adjust_num_of_windows(const Window &window,
+                                              std::size_t split_dimension,
+                                              std::size_t init_num_windows,
+                                              const ICPPKernel &kernel,
+                                              const CPUInfo &cpu_info)
 {
     // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
-    if(window.num_iterations(split_dimension) < init_num_windows)
+    if (window.num_iterations(split_dimension) < init_num_windows)
     {
         auto recommended_split_dim = Window::DimX;
-        for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
+        for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
        {
-            if(window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
+            if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
            {
                 recommended_split_dim = dims;
            }
        }
-        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", split_dimension,
-                                                  recommended_split_dim);
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+            "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim",
+            split_dimension, recommended_split_dim);
     }
-    for(auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first
+    for (auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first
     {
         // Try splitting the workload into t, subject to each subworkload size <= mws.
-        if((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
+        if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
        {
-            if(t != init_num_windows)
+            if (t != init_num_windows)
            {
-                ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using a different thread count than the one assigned by the user.");
+                ARM_COMPUTE_LOG_INFO_MSG_CORE(
+                    "The scheduler is using a different thread count than the one assigned by the user.");
            }
             return t;
        }
    }
-    ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using single thread instead of the thread count assigned by the user.");
+    ARM_COMPUTE_LOG_INFO_MSG_CORE(
+        "The scheduler is using single thread instead of the thread count assigned by the user.");
     return 1; // If the workload is so small that it can't be split, we should run a single thread
 }
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index a6bc950644..8e5b62ae7d 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager()
 
 void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
 {
-    if(_active_group == nullptr)
+    if (_active_group == nullptr)
     {
         ARM_COMPUTE_ERROR_ON(group == nullptr);
         _active_group = group;
@@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
 
 bool ISimpleLifetimeManager::release_group(IMemoryGroup *group)
 {
-    if(group == nullptr)
+    if (group == nullptr)
     {
         return false;
     }
     const bool status = bool(_finalized_groups.erase(group));
-    if(status)
+    if (status)
     {
         group->mappings().clear();
     }
@@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group)
 void ISimpleLifetimeManager::start_lifetime(void *obj)
 {
     ARM_COMPUTE_ERROR_ON(obj == nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!");
+    ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements),
+                             "Memory object is already registered!");
 
     // Check if there is a free blob
-    if(_free_blobs.empty())
+    if (_free_blobs.empty())
     {
-        _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
+        _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}});
     }
     else
     {
@@ -100,10 +101,8 @@ void
ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t el.status = true; // Find object in the occupied lists - auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b) - { - return obj == b.id; - }); + auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), + [&obj](const Blob &b) { return obj == b.id; }); ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs)); // Update occupied blob and return as free @@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it); // Check if all objects are finalized and reset active group - if(are_all_finalized()) + if (are_all_finalized()) { ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty()); @@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t bool ISimpleLifetimeManager::are_all_finalized() const { - return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair &e) - { - return !e.second.status; - }); + return !std::any_of(std::begin(_active_elements), std::end(_active_elements), + [](const std::pair &e) { return !e.second.status; }); } } // namespace arm_compute diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp index 373c50c73d..96287dcc49 100644 --- a/src/runtime/IWeightsManager.cpp +++ b/src/runtime/IWeightsManager.cpp @@ -25,14 +25,13 @@ namespace arm_compute { -IWeightsManager::IWeightsManager() - : _managed_weights(), _managed_counter(), _managed_weights_parents() +IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents() { } void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) { - if(!are_weights_managed(weights)) + if (!are_weights_managed(weights)) { _managed_weights[weights]; _managed_counter[weights]; @@ -44,9 +43,9 @@ void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) // In case the weights are an output of a previous reshape function // store the parent's link - if(parent != nullptr) + if (parent != nullptr) { - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { _managed_weights_parents[weights] = parent; } @@ -59,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Find if I have the same weights with weights transform. 
If I do, don't run the reshape auto item = _managed_weights.find(weights); - bool perform_run{ true }; - ITensor *weights_tensor{ nullptr }; + bool perform_run{true}; + ITensor *weights_tensor{nullptr}; // Check if I already have the requested transform and I have run the reshape function - for(auto it : item->second) + for (auto it : item->second) { - if(it->is_reshape_run() && (it->uid() == weights_transform->uid())) + if (it->is_reshape_run() && (it->uid() == weights_transform->uid())) { weights_tensor = it->get_weights(); perform_run = false; @@ -73,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights } } - if(perform_run) + if (perform_run) { weights_transform->run(); weights_tensor = weights_transform->get_weights(); @@ -81,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check if we can release memory from parent auto parent_item = _managed_weights_parents.find(weights); - if(parent_item != _managed_weights_parents.end()) + if (parent_item != _managed_weights_parents.end()) { int32_t refcount = parent_item->second->decrease_refcount(); - if(refcount == 0) + if (refcount == 0) { parent_item->second->release(); } @@ -92,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check top level weights. If all the transformations are done // mark the weights as unused - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { auto item = _managed_weights.find(weights); bool mark_as_unused = true; - for(auto it : item->second) + for (auto it : item->second) { - if(!it->is_reshape_run()) + if (!it->is_reshape_run()) { mark_as_unused = false; break; } } - if(mark_as_unused) + if (mark_as_unused) { weights->mark_as_unused(); } @@ -123,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei { ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed"); - ITensor *transformed_weights{ nullptr }; + ITensor *transformed_weights{nullptr}; auto item = _managed_weights.find(weights); // Check if I already have the requested transform. 
If I do, // increase the refcount of the transformed weights object and // reuse the tensor - for(auto it : item->second) + for (auto it : item->second) { - if(it->uid() == weights_transform->uid()) + if (it->uid() == weights_transform->uid()) { transformed_weights = it->get_weights(); it->increase_refcount(); @@ -139,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei } } - if(transformed_weights == nullptr) + if (transformed_weights == nullptr) { transformed_weights = weights_transform->get_weights(); weights_transform->increase_refcount(); @@ -154,13 +153,13 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei void IWeightsManager::release(const ITensor *weights) { - if(weights == nullptr || !are_weights_managed(weights)) + if (weights == nullptr || !are_weights_managed(weights)) { return; } _managed_counter[weights].counter--; - if(_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) + if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) { weights->mark_as_unused(); } @@ -168,7 +167,7 @@ void IWeightsManager::release(const ITensor *weights) void IWeightsManager::pre_mark_as_unused(const ITensor *weights) { - if(weights == nullptr || !are_weights_managed(weights)) + if (weights == nullptr || !are_weights_managed(weights)) { return; } diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp index ac0a32539e..90fd025eb7 100644 --- a/src/runtime/Memory.cpp +++ b/src/runtime/Memory.cpp @@ -27,20 +27,17 @@ namespace arm_compute { -Memory::Memory() - : _region(nullptr), _region_owned(nullptr) +Memory::Memory() : _region(nullptr), _region_owned(nullptr) { } -Memory::Memory(const std::shared_ptr &memory) - : _region(nullptr), _region_owned(memory) +Memory::Memory(const std::shared_ptr &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -Memory::Memory(IMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp index 2e418ae9e3..5fa9ea47e9 100644 --- a/src/runtime/MemoryManagerOnDemand.cpp +++ b/src/runtime/MemoryManagerOnDemand.cpp @@ -31,7 +31,8 @@ namespace arm_compute { -MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr lifetime_manager, std::shared_ptr pool_manager) +MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr lifetime_manager, + std::shared_ptr pool_manager) : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)) { ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!"); @@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t // Create pools auto pool_template = _lifetime_mgr->create_pool(&allocator); - for(int i = num_pools; i > 1; --i) + for (int i = num_pools; i > 1; --i) { auto pool = pool_template->duplicate(); _pool_mgr->register_pool(std::move(pool)); diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp index a5fc0a2726..fcfd3251ff 100644 --- a/src/runtime/NEON/INEOperator.cpp +++ b/src/runtime/NEON/INEOperator.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/INEOperator.h" + #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -32,14 +34,13 @@ namespace experimental { INEOperator::~INEOperator() = default; -INEOperator::INEOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void INEOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp index 5438bce62a..b6977221b9 100644 --- a/src/runtime/NEON/INESimpleFunction.cpp +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute @@ -33,8 +34,7 @@ namespace arm_compute INESimpleFunction::~INESimpleFunction() = default; INESimpleFunction::INESimpleFunction() // NOLINT - : _kernel(), - _border_handler() + : _kernel(), _border_handler() { } diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp index 21dd58e378..04bff9fa4b 100644 --- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp +++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" #include "src/runtime/Utils.h" @@ -32,9 +33,7 @@ namespace arm_compute { INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default; -INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) - : _kernel(), - _ctx(ctx) +INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx) { } diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index e48aede590..59199452ce 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -24,24 +24,24 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuActivation.h" namespace arm_compute { struct NEActivationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - IRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + IRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) - : _impl(std::make_unique()) +NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; +NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default; NEActivationLayer::~NEActivationLayer() = default; @@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info); } -Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status 
+NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return cpu::CpuActivation::validate(input, output, act_info); } diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp index cfeaefc4fd..a72364791c 100644 --- a/src/runtime/NEON/functions/NEAddMulAdd.cpp +++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuAddMulAdd.h" @@ -33,45 +34,50 @@ namespace arm_compute { struct NEAddMulAdd::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; MemoryGroup memory_group{}; }; -NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } NEAddMulAdd::~NEAddMulAdd() = default; -void NEAddMulAdd::configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add, ITensor *add_output, - ITensor *final_output, const ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEAddMulAdd::configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + const ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); - _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), - bn_add->info(), add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info); + _impl->op = std::make_unique(); + _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), + add_output != nullptr ? 
add_output->info() : nullptr, final_output->info(), policy, act_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input1 }, - { TensorType::ACL_SRC_1, input2 }, - { TensorType::ACL_SRC_2, bn_mul }, - { TensorType::ACL_SRC_3, bn_add }, - { TensorType::ACL_DST_0, add_output }, - { TensorType::ACL_DST_1, final_output }, + _impl->run_pack = { + {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul}, + {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output}, }; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *bn_mul, - const ITensorInfo *bn_add, const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); } diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index 3ac127b02e..fbaf1a96e7 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -32,6 +32,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -48,8 +49,7 @@ struct NEArgMinMaxLayer::Impl NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; -NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } @@ -58,7 +58,8 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons { ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); _impl->reduction_function = std::make_unique(); - if(output->info() && (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) + if (output->info() && + (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->cast_function = std::make_unique(); @@ -74,9 +75,11 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons } } -Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid operation"); return NEReductionOperation::validate(input, output, axis, op, false); } @@ -84,7 +87,7 @@ void NEArgMinMaxLayer::run() { MemoryGroupResourceScope 
scope_mg(_impl->memory_group); _impl->reduction_function->run(); - if(_impl->tmp_reduction_result != nullptr) + if (_impl->tmp_reduction_result != nullptr) { _impl->cast_function->run(); } diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index a7581ca9f4..aff16ae9d1 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuAdd.h" #include @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticAddition::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEArithmeticAddition::NEArithmeticAddition() - : _impl(std::make_unique()) +NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique()) { } -NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; +NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default; NEArithmeticAddition::~NEArithmeticAddition() = default; -Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuAdd::validate(input1, input2, output, policy, act_info); } -void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticAddition::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 6fdd4267bf..097525c1a8 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuSub.h" #include @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticSubtraction::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEArithmeticSubtraction::NEArithmeticSubtraction() - : _impl(std::make_unique()) +NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique()) { } -NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; 
-Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuSub::validate(input1, input2, output, policy, act_info); } -void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticSubtraction::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index db49f4c1a0..d491f0aafc 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" @@ -36,12 +37,17 @@ namespace arm_compute { NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; -NEBatchNormalizationLayer::NEBatchNormalizationLayer() - : _norm_kernel() +NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() { } -void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, +void NEBatchNormalizationLayer::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); @@ -50,10 +56,17 @@ void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } -Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index e258028d05..5d711c5ddf 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include 
"src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" @@ -41,19 +42,25 @@ void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_s _kernel = std::move(k); } -void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayer::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { auto k = std::make_unique(); k->configure(input, block_shape_x, block_shape_y, output, crop_info); _kernel = std::move(k); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return NEBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 90eb72706e..89ce2087be 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" -#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 69e5288b88..eda59cd3e9 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" -#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index 0b19e919ee..3d6f30b0fe 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" -#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp index cc9df9f1c4..f0cf3d3e5c 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" -#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index af00171be6..adf891e417 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ 
-22,12 +22,16 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" namespace arm_compute { -void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransform::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); // Configure Bounding Box kernel @@ -36,7 +40,10 @@ void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes _kernel = std::move(k); } -Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index f93a6ea745..1fd172a730 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuCast.h" @@ -31,16 +32,15 @@ namespace arm_compute { struct NECast::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECast::NECast() - : _impl(std::make_unique()) +NECast::NECast() : _impl(std::make_unique()) { } -NECast::NECast(NECast &&) = default; +NECast::NECast(NECast &&) = default; NECast &NECast::operator=(NECast &&) = default; NECast::~NECast() = default; @@ -62,7 +62,7 @@ Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void NECast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index 8b96fadb74..86bee4dd43 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index ceb697aad6..59a0892f1f 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -23,33 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "src/cpu/operators/CpuConcatenate.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include 
"arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuConcatenate.h" namespace arm_compute { struct NEConcatenateLayer::Impl { std::vector srcs{}; - ITensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + ITensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr op{nullptr}; }; -NEConcatenateLayer::NEConcatenateLayer() - : _impl(std::make_unique()) +NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique()) { } -NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; +NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default; NEConcatenateLayer::~NEConcatenateLayer() = default; @@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op = std::make_unique(); std::vector inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis); } -Status NEConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status NEConcatenateLayer::validate(const std::vector &inputs_vector, + const ITensorInfo *output, + size_t axis) { return cpu::CpuConcatenate::validate(inputs_vector, output, axis); } @@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector &inpu void NEConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp index 3bb66c44b0..8f41151d6c 100644 --- a/src/runtime/NEON/functions/NEConv3D.cpp +++ b/src/runtime/NEON/functions/NEConv3D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv3d.h" @@ -35,35 +36,41 @@ using namespace arm_compute::experimental; struct NEConv3D::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -NEConv3D::NEConv3D() - : _impl(std::make_unique()) +NEConv3D::NEConv3D() : _impl(std::make_unique()) { } NEConv3D::~NEConv3D() = default; -void NEConv3D::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) +void NEConv3D::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info); auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), + conv_info); _impl->op = std::move(f); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; } } -Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info) +Status NEConv3D::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info)); @@ -72,7 +79,7 @@ Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, void NEConv3D::run() { - if(_impl->op != nullptr) + if (_impl->op != nullptr) { _impl->op->run(_impl->run_pack); } diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index 535ac99001..84e8565aaf 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -24,24 +24,26 @@ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" namespace arm_compute { struct NEConvertFullyConnectedWeights::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() - : _impl(std::make_unique()) +NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique()) { } NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default; -void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void NEConvertFullyConnectedWeights::configure(const ITensor *input, + ITensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -51,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } @@ -64,4 +68,4 @@ void NEConvertFullyConnectedWeights::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // 
namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 89e0e498c9..37958fc2e9 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuConv2d.h" @@ -43,34 +44,44 @@ struct NEConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr memory_manager{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr func{ nullptr }; + std::unique_ptr func{nullptr}; }; -NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } NEConvolutionLayer::~NEConvolutionLayer() = default; -void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void NEConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: @@ -78,7 +89,8 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const case ConvolutionMethod::DIRECT: { auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), + output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); _impl->op = std::move(f); break; } @@ -94,33 +106,46 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const break; } - if(_impl->op) + if (_impl->op) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - switch(cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, + weights_info, dilation, act_info, enable_fast_math, + num_groups)); break; case ConvolutionMethod::FFT: - ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; default: ARM_COMPUTE_ERROR("Not supported."); @@ -129,12 +154,17 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, - const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod 
NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math); } void NEConvolutionLayer::run() @@ -143,7 +173,7 @@ void NEConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -155,7 +185,7 @@ void NEConvolutionLayer::run() void NEConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp index c2059e8e98..c975d3a5b5 100644 --- a/src/runtime/NEON/functions/NECopy.cpp +++ b/src/runtime/NEON/functions/NECopy.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCopy.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NECopy::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECopy::NECopy() - : _impl(std::make_unique()) +NECopy::NECopy() : _impl(std::make_unique()) { } -NECopy::NECopy(NECopy &&) = default; +NECopy::NECopy(NECopy &&) = default; NECopy &NECopy::operator=(NECopy &&) = default; NECopy::~NECopy() = default; diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp index cca8b400ee..a94b0882da 100644 --- a/src/runtime/NEON/functions/NECropResize.cpp +++ b/src/runtime/NEON/functions/NECropResize.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/runtime/NEON/functions/NECropResize.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NECropKernel.h" @@ -35,18 +36,32 @@ namespace arm_compute NECropResize::~NECropResize() = default; NECropResize::NECropResize() - : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results() + : _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _crop(), + _scale(), + _crop_results(), + _scaled_results() { } -Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status NECropResize::validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), + box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, + extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -56,11 +71,17 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes return Status{}; } -void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void NECropResize::configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -81,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I _scaled_results.reserve(_num_boxes); _scale.reserve(_num_boxes); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { auto crop_tensor = std::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -108,7 +129,7 @@ void NECropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { // Size of the crop box in _boxes and thus the shape of _crop_results[i] // may not be known until 
run-time and so the kernels cannot be configured until then. @@ -117,12 +138,15 @@ void NECropResize::run() NEScheduler::get().schedule(_crop[i].get(), Window::DimZ); // Scale the cropped image. - _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false }); + _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), + SamplingPolicy::TOP_LEFT, false}); _scaled_results[i]->allocator()->allocate(); _scale[i]->run(); // Copy scaled image into output. - std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i))); + std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), + _output->ptr_to_element(Coordinates(0, 0, 0, i))); } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 439aff0840..3987370d9e 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -61,7 +62,8 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; - return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, + DimensionRoundingType::FLOOR); } } // namespace @@ -82,17 +84,24 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memor { } -Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info, - bool enable_fast_math, const WeightsInfo &weights_info) +Status NEDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); 
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); - if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); } @@ -101,11 +110,13 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info); + auto out_dims = + deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), + weights->dimension(width_idx), weights->dimension(height_idx), info); - if(bias != nullptr) + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -115,15 +126,18 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf } } - if(output->tensor_shape().total_size() > 0) + if (output->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } uint32_t deconv_pad_x = 0; @@ -141,44 +155,61 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first); ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second); - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling when the operation uses unit stride in all dimensions const bool do_upsampling = stride_x != 1 || stride_y != 1; - const unsigned int batches_idx = 
get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - if(do_upsampling) + if (do_upsampling) { const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, + weights_info, Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, + Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } return Status{}; } -void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math, const WeightsInfo &weights_info) +void NEDeconvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), + (bias == nullptr) ? 
nullptr : bias->info(), + output->info(), info, enable_fast_math, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); + auto out_dims = deconvolution_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); @@ -191,7 +222,8 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con const unsigned int stride_y = info.stride().second; // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); @@ -199,12 +231,11 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); // setup the function to convolve the upscaled output - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), - stride_x, stride_y, - out_dims, deconv_pad_x, deconv_pad_y); - const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling when the operation uses unit stride in all dimensions _do_upsampling = stride_x != 1 || stride_y != 1; @@ -216,12 +247,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con axis_data[1] = static_cast(height_idx); // Setup convolution and upsampling, if needed - if(_do_upsampling) + if (_do_upsampling) { _memory_group.manage(&_scaled_output); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); scale_out_info.set_data_layout(data_layout); _scaled_output.allocator()->init(scale_out_info); @@ -229,14 +260,17 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con // The padding amount can be given as input to the convolution layer. 
_upsample_f.configure(input, &_scaled_output, upsample_info); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); _scaled_output.allocator()->allocate(); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); } } @@ -246,7 +280,7 @@ void NEDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_upsampling) + if (_do_upsampling) { _upsample_f.run(); } @@ -255,7 +289,7 @@ void NEDeconvolutionLayer::run() void NEDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp index 1ec32074a5..766635dfa1 100644 --- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCast.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NEDepthConvertLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDepthConvertLayer::NEDepthConvertLayer() - : _impl(std::make_unique()) +NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique()) { } -NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default; NEDepthConvertLayer::~NEDepthConvertLayer() = default; @@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); } -Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return cpu::CpuCast::validate(input, output, policy); @@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void NEDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 
f4a8a17e05..47564059ec 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 4dabef3bd7..6c085645db 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDepthwiseConv2d.h" @@ -39,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl { - ITensor *src{ nullptr }; // SRC_0 - ITensor *dst{ nullptr }; // DST_0 - const ITensor *weights - { - nullptr - }; // SRC_1 - const ITensor *biases - { - nullptr - }; // SRC_2 + ITensor *src{nullptr}; // SRC_0 + ITensor *dst{nullptr}; // DST_0 + const ITensor *weights{nullptr}; // SRC_1 + const ITensor *biases{nullptr}; // SRC_2 Tensor permuted_input{}; // INT_0 Tensor permuted_weights{}; // INT_1 Tensor permuted_output{}; // INT_2 Tensor workspace{}; // INT_3 Tensor packed_weights{}; // INT_4 - std::shared_ptr op{ nullptr }; - bool is_prepared{ false }; - bool permute{ false }; + std::shared_ptr op{nullptr}; + bool is_prepared{false}; + bool permute{false}; }; -NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr memory_manager) +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal( + std::shared_ptr memory_manager) : _memory_group(memory_manager), _impl(std::make_unique()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure( + ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -82,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permute = is_nhwc; _impl->op = std::make_unique(); - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(), - _impl->dst->info(), info); + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op->configure(_impl->src->info(), _impl->weights->info(), + _impl->biases == nullptr ? 
nullptr : _impl->biases->info(), _impl->dst->info(), info); // Configure pipeline ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); @@ -92,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - if(!is_activationlayer_enabled) + if (!is_activationlayer_enabled) { act_info_to_use = act_info; } - info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation }; + info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation}; auto dwc_optimized_func = std::make_unique(); - if(is_nhwc) + if (is_nhwc) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -122,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info()); // Configure optimized depthwise - dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info); + dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), + biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), + info); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); @@ -133,29 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: } else { - dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info); + dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->dst->info(), info); } // Allocate memory based on the internal memory requirements experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); - _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment); - _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment); + _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8), + mem_req[0].alignment); + _impl->packed_weights.allocator()->init( + TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment); _memory_group.manage(&_impl->workspace); _memory_group.manage(&_impl->packed_weights); _impl->workspace.allocator()->allocate(); _impl->packed_weights.allocator()->allocate(); } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +Status +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -180,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { // Permute weights - if(_impl->permute) + if (_impl->permute) { _impl->permuted_weights.allocator()->allocate(); } - if(!_impl->permuted_weights.is_used()) + if (!_impl->permuted_weights.is_used()) { _impl->permuted_weights.allocator()->free(); } @@ -202,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl Tensor permuted_input{}; Tensor permuted_weights{}; Tensor permuted_output{}; - bool is_prepared{ false }; - bool is_nchw{ false }; - bool is_activationlayer_enabled{ false }; - const ITensor *weights{ nullptr }; - const ITensor *biases{ nullptr }; - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::shared_ptr op{ nullptr }; + bool is_prepared{false}; + bool is_nchw{false}; + bool is_activationlayer_enabled{false}; + const ITensor *weights{nullptr}; + const ITensor *biases{nullptr}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::shared_ptr op{nullptr}; }; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() @@ -217,14 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) 
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), + info); _impl->src = input; _impl->dst = output; @@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); weights_to_use = &_impl->permuted_weights; - _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + _impl->permuted_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); output_to_use = &_impl->permuted_output; } auto depthwise_conv_kernel = std::make_unique(); - depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); + depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), + biases == nullptr ? 
nullptr : biases->info(), output_to_use->info(), info); - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_output = std::make_unique(); permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); @@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( } } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -298,49 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr op{ nullptr }; + std::shared_ptr op{nullptr}; }; #endif // DOXYGEN_SKIP_THIS -void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info, + depth_multiplier, act_info, dilation)); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_shared(); - _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - info); - switch(_impl->depth_conv_func) + _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function( + input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), info); + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; case DepthwiseConvolutionFunction::GENERIC: - _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; default: ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); } } -Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.run(); @@ -355,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.prepare(); diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp index 83e0131c83..28d19d2950 100644 --- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuDequantize.h" namespace arm_compute { struct NEDequantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDequantizationLayer::NEDequantizationLayer() - : _impl(std::make_unique()) +NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique()) { } NEDequantizationLayer::~NEDequantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp index 1da8b012b3..b347390162 100644 --- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp +++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include @@ -35,24 +36,36 @@ namespace arm_compute { NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), 
_decoded_scores(), _run_dequantize(false) + : _memory_group(std::move(memory_manager)), + _dequantize(), + _detection_post_process(), + _decoded_scores(), + _run_dequantize(false) { } -void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); - ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), - output_scores->info(), - num_detection->info(), info)); - ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); const ITensor *input_scores_to_use = input_scores; DetectionPostProcessLayerInfo info_to_use = info; _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type()); - if(_run_dequantize) + if (_run_dequantize) { _memory_group.manage(&_decoded_scores); @@ -61,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c input_scores_to_use = &_decoded_scores; // Create a new info struct to avoid dequantizing in the CPP layer - std::array scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() }; - DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(), - scales_values, info.use_regular_nms(), info.detection_per_class(), false); + std::array scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), + info.scale_value_w()}; + DetectionPostProcessLayerInfo info_quantized( + info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), + info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false); info_to_use = info_quantized; } - _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use); + _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info_to_use); _decoded_scores.allocator()->allocate(); } -Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors, - 
ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_scores, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type()); - if(run_dequantize) + if (run_dequantize) { TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info)); + ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, + output_boxes, output_classes, output_scores, + num_detection, info)); return Status{}; } @@ -90,7 +114,7 @@ void NEDetectionPostProcessLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Decode scores if necessary - if(_run_dequantize) + if (_run_dequantize) { _dequantize.run(); } diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index ef3d3d6055..f1c2cf969f 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -27,17 +27,18 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/cpu/operators/CpuDirectConv2d.h" namespace arm_compute { struct NEDirectConvolutionLayer::Impl { - ITensor *src{ nullptr }; - const ITensor *weights{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + const ITensor *weights{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr memory_manager) @@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptrsrc = input; _impl->weights = weights; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(_memory_manager); - _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info); + _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? 
bias->info() : nullptr), output->info(), + conv_info, act_info); } -Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info); diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp index c958adf97c..685ef2d4d7 100644 --- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp +++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h" -#include "arm_compute/core/Validate.h" -#include "src/cpu/operators/CpuElementwise.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuElementwise.h" #include @@ -33,17 +34,16 @@ namespace arm_compute { struct NEElementwiseMax::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMax::NEElementwiseMax() - : _impl(std::make_unique()) +NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique()) { } -NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; +NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default; NEElementwiseMax::~NEElementwiseMax() = default; @@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMax::validate(input1, input2, output); @@ -74,17 +77,16 @@ void NEElementwiseMax::run() struct NEElementwiseMin::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMin::NEElementwiseMin() - : _impl(std::make_unique()) +NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique()) { } -NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; +NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default; NEElementwiseMin::~NEElementwiseMin() = default; @@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const 
ActivationLayerInfo &act_info) +Status NEElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMin::validate(input1, input2, output); @@ -115,21 +120,23 @@ void NEElementwiseMin::run() struct NEElementwiseSquaredDiff::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() - : _impl(std::make_unique()) +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique()) { } -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default; -void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseSquaredDiff::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output); @@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run() struct NEElementwiseDivision::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseDivision::NEElementwiseDivision() - : _impl(std::make_unique()) +NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique()) { } -NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; +NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default; NEElementwiseDivision::~NEElementwiseDivision() = default; -void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseDivision::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo 
*output, const ActivationLayerInfo &act_info) +Status NEElementwiseDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseDivision::validate(input1, input2, output); @@ -197,21 +212,23 @@ void NEElementwiseDivision::run() struct NEElementwisePower::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwisePower::NEElementwisePower() - : _impl(std::make_unique()) +NEElementwisePower::NEElementwisePower() : _impl(std::make_unique()) { } -NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; +NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default; NEElementwisePower::~NEElementwisePower() = default; -void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwisePower::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwisePower::validate(input1, input2, output); @@ -239,22 +259,22 @@ void NEElementwisePower::run() template struct NEElementwiseComparisonStatic::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr> op{nullptr}; }; template -NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() - : _impl(std::make_unique()) +NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() : _impl(std::make_unique()) { } template NEElementwiseComparisonStatic::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default; -template -NEElementwiseComparisonStatic &NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; -template +template +NEElementwiseComparisonStatic & +NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; +template NEElementwiseComparisonStatic::~NEElementwiseComparisonStatic() = default; template @@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic::configure(ITensor *input1, ITensor *inp } template -Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) { return cpu::CpuElementwiseComparisonStatic::validate(input1, input2, output); } template -void 
NEElementwiseComparisonStatic::run() +void NEElementwiseComparisonStatic::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); @@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic::run() struct NEElementwiseComparison::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseComparison::NEElementwiseComparison() - : _impl(std::make_unique()) +NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique()) { } -NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; +NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default; NEElementwiseComparison::~NEElementwiseComparison() = default; @@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso _impl->op->configure(input1->info(), input2->info(), output->info(), op); } -Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op) +Status NEElementwiseComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation op) { return cpu::CpuElementwiseComparison::validate(input1, input2, output, op); } diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp index a0674ec320..23a092c407 100644 --- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp +++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp @@ -22,7 +22,9 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h" + #include "src/cpu/operators/CpuElementwiseUnary.h" + #include namespace arm_compute @@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary; template struct NEElementwiseUnaryLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr cpu_op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr cpu_op{nullptr}; }; template -NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() - : _impl(std::make_unique()) +NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() : _impl(std::make_unique()) { } template NEElementwiseUnaryLayer::~NEElementwiseUnaryLayer() = default; template NEElementwiseUnaryLayer::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default; -template +template NEElementwiseUnaryLayer &NEElementwiseUnaryLayer::operator=(NEElementwiseUnaryLayer &&) = default; template @@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer::validate(const ITensorInfo *input, const ITe } template -void NEElementwiseUnaryLayer::run() +void NEElementwiseUnaryLayer::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _impl->src); diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp index 343b817eba..fb75f9da29 100644 --- a/src/runtime/NEON/functions/NEFFT1D.cpp +++ b/src/runtime/NEON/functions/NEFFT1D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" @@ -37,7 +38,15 @@ namespace arm_compute NEFFT1D::~NEFFT1D() = default; NEFFT1D::NEFFT1D(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(), + _fft_kernels(), + _scale_kernel(), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _axis(0), + _run_scale(false) { } @@ -74,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & _fft_kernels.resize(_num_ffts); _axis = config.axis; - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -84,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels[i] = std::make_unique(); - _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, + fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; _scale_kernel = std::make_unique(); - is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? 
_scale_kernel->configure(&_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -113,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -122,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { // All combinations are supported except real input with real output (i.e., both input channels set to 1) ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); @@ -140,13 +151,13 @@ void NEFFT1D::run() NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ)); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX)); } // Run output scaling - if(_run_scale) + if (_run_scale) { NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY); } diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp index ab422bd2ae..066909221d 100644 --- a/src/runtime/NEON/functions/NEFFT2D.cpp +++ b/src/runtime/NEON/functions/NEFFT2D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -33,7 +34,10 @@ namespace arm_compute NEFFT2D::~NEFFT2D() = default; NEFFT2D::NEFFT2D(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -78,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp index 0551d756fb..94f85e5ffa 100644 --- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp @@ -25,15 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include 
"src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" namespace arm_compute @@ -46,11 +47,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -102,8 +103,13 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr mem } NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default; -void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void NEFFTConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); @@ -115,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -137,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -158,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -166,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -193,7 +202,8 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -205,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + 
_extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), + Coordinates(end_right, end_botton)); _reshaped_output.allocator()->allocate(); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -235,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -247,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(output, nullptr, act_info); } @@ -260,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co axis_data[1] = 1; } -Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); @@ -279,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); @@ -291,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if 
(act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } @@ -313,7 +334,7 @@ void NEFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -331,17 +352,17 @@ void NEFFTConvolutionLayer::run() _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -349,10 +370,10 @@ void NEFFTConvolutionLayer::run() void NEFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -362,7 +383,7 @@ void NEFFTConvolutionLayer::prepare() const ITensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp index 43667783bf..bc1d5b7f5c 100644 --- a/src/runtime/NEON/functions/NEFill.cpp +++ b/src/runtime/NEON/functions/NEFill.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEFill.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFill.h" #include @@ -32,15 +33,14 @@ namespace arm_compute { struct NEFill::Impl { - ITensor *tensor{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *tensor{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFill::NEFill() - : _impl(std::make_unique()) +NEFill::NEFill() : _impl(std::make_unique()) { } -NEFill::NEFill(NEFill &&) = default; +NEFill::NEFill(NEFill &&) = default; NEFill &NEFill::operator=(NEFill &&) = default; NEFill::~NEFill() = default; diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp index d633e340f8..a3ab9c3db4 100644 --- a/src/runtime/NEON/functions/NEFillBorder.cpp +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { -NEFillBorder::NEFillBorder() - : _border_handler(nullptr) +NEFillBorder::NEFillBorder() : _border_handler(nullptr) { } -void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorder::configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value); _border_handler = std::make_unique(); diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index f435842634..56db2be3fa 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include 
"src/cpu/operators/CpuFlatten.h" @@ -33,16 +34,15 @@ namespace arm_compute { struct NEFlattenLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFlattenLayer::NEFlattenLayer() - : _impl(std::make_unique()) +NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique()) { } -NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; +NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default; NEFlattenLayer::~NEFlattenLayer() = default; @@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique(); _impl->op->configure(_impl->src->info(), _impl->dst->info()); @@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return cpu::CpuFlatten::validate(input, output); diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp index d2dc48a159..112c93c478 100644 --- a/src/runtime/NEON/functions/NEFloor.cpp +++ b/src/runtime/NEON/functions/NEFloor.cpp @@ -24,22 +24,22 @@ #include "arm_compute/runtime/NEON/functions/NEFloor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFloor.h" namespace arm_compute { struct NEFloor::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFloor::NEFloor() - : _impl(std::make_unique()) +NEFloor::NEFloor() : _impl(std::make_unique()) { } -NEFloor::NEFloor(NEFloor &&) = default; +NEFloor::NEFloor(NEFloor &&) = default; NEFloor &NEFloor::operator=(NEFloor &&) = default; NEFloor::~NEFloor() = default; diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 891487efd3..2656d0fa0f 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuFullyConnected.h" @@ -38,80 +39,90 @@ using namespace arm_compute::experimental; struct NEFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager 
*weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; NEFullyConnectedLayer::~NEFullyConnectedLayer() = default; -NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); _impl->weights_manager = weights_manager; } -void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void NEFullyConnectedLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), - weights->info(), + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info, - weights_info)); + output->info(), fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info); _impl->op = std::make_unique(); _impl->original_weights = weights; _impl->is_prepared = false; - _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info, weights_info); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), + fc_info, weights_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); - - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info, - const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info) { - return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, weights_info); + return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, + weights_info); } -Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info); } void NEFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -122,7 +133,7 @@ void NEFullyConnectedLayer::run() void NEFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -131,13 +142,13 @@ void NEFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) 
{ _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp index 6612845d86..f5b8b57dac 100644 --- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp +++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" @@ -35,29 +36,42 @@ namespace arm_compute { NEFuseBatchNormalization::~NEFuseBatchNormalization() = default; -NEFuseBatchNormalization::NEFuseBatchNormalization() - : _fuse_bn_kernel() +NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel() { } -void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalization::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, - bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); _fuse_bn_kernel = std::make_unique(); - _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); } -Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void NEFuseBatchNormalization::run() diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index e51f2f9eb6..934a8250cc 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/CPP/Validate.h" #include 
"src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemm.h" @@ -39,12 +40,12 @@ namespace arm_compute struct NEGEMM::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_b{ nullptr }; - bool is_prepared{ false }; + const ITensor *original_b{nullptr}; + bool is_prepared{false}; ITensorPack run_pack{}; ITensorPack prep_pack{}; @@ -61,10 +62,17 @@ NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager * NEGEMM::~NEGEMM() = default; -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) +void NEGEMM::configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, + d->info(), alpha, beta, gemm_info)); // Check if we need to reshape the matrix B only on the first run _impl->is_prepared = false; @@ -73,24 +81,32 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe // Make the B matrix dynamic values. auto b_info_to_use = b->info()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c }, { ACL_DST, d } }; - _impl->prep_pack = { { ACL_SRC_1, b }, { ACL_SRC_2, c } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; + _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. 
auto b_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_to_use->set_are_values_constant(false); } @@ -98,8 +114,14 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info); } -Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, - float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha, beta); return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info); @@ -115,15 +137,15 @@ void NEGEMM::run() void NEGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->original_b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 42b8b70405..6cca02eea9 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmDirectConv2d.h" @@ -35,25 +36,25 @@ using namespace arm_compute::experimental; struct NEGEMMConv2d::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; MemoryGroup memory_group{}; - bool is_prepared{ false }; + bool is_prepared{false}; experimental::MemoryRequirements aux_mem_req{}; }; -NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) - : _impl(std::make_unique()) +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; -void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +void NEGEMMConv2d::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -61,15 +62,21 @@ void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITens _impl->is_prepared = false; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; - _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) +Status NEGEMMConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info) { return OperatorType::validate(input, weights, biases, output, info); } @@ -84,15 +91,15 @@ void NEGEMMConv2d::run() void NEGEMMConv2d::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index fe3ea6a767..c8f65d2fd9 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmConv2d.h" @@ -36,17 +37,18 @@ namespace arm_compute { struct NEGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, IWeightsManager *weights_manager) +NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -54,37 +56,61 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptrweights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), + conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); } -Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { - return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, + dilation, act_info, enable_fast_math); } void NEGEMMConvolutionLayer::run() @@ -96,7 +122,7 @@ void NEGEMMConvolutionLayer::run() void NEGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 453d3cedef..44bfc6a51e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -29,8 +29,8 @@ 
#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" using namespace arm_compute::experimental; @@ -39,18 +39,19 @@ namespace arm_compute { struct NEGEMMLowpMatrixMultiplyCore::Impl { - const ITensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *b{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -58,41 +59,41 @@ NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } _impl->b = b; _impl->op = std::make_unique(); - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, a }, - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c }, - { TensorType::ACL_DST, output } - }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), + gemm_info); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. 
auto b_info_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } @@ -109,15 +110,15 @@ void NEGEMMLowpMatrixMultiplyCore::run() void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 7e1de3c257..8178003b5e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -25,45 +25,48 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuGemmLowpOutputStage.h" namespace arm_compute { struct NEGEMMLowpOutputStage::Impl { - const ITensor *src{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; ITensorPack run_pack{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; }; -NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() - : _impl(std::make_unique()) +NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique()) { } NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; -void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) +void NEGEMMLowpOutputStage::configure(const ITensor *input, + const ITensor *bias, + ITensor *output, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); _impl->src = input; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(); _impl->op->configure(input->info(), (bias == nullptr) ? 
nullptr : bias->info(), output->info(), info); - _impl->run_pack = - { - { TensorType::ACL_SRC, _impl->src }, - { TensorType::ACL_BIAS, _impl->bias }, - { TensorType::ACL_DST, _impl->dst } - }; + _impl->run_pack = { + {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}}; } -Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index f5d19c769e..62b8cfa48b 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" -#include "src/core/NEON/kernels/NEGatherKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" #include diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp index 1c0e736766..1022b4153e 100644 --- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp +++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp @@ -25,11 +25,12 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" -#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { @@ -68,42 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = 
scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); _compute_anchors = std::make_unique(); - _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors->configure(anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -117,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -131,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d Tensor *anchors_to_use = &_all_anchors; Tensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -154,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals.configure(&_all_proposals, 
&_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -174,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -187,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _memory_group.manage(&_proposals_4_roi_values); - const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()); - _cpp_nms.configure(&_scores_flattened /*scores_in*/, - _all_proposals_to_use /*boxes_in,*/, - nullptr /* batch_splits_in*/, - scores_out /* scores_out*/, - &_proposals_4_roi_values /*boxes_out*/, - &_classes_nms_unused /*classes*/, - nullptr /*batch_splits_out*/, - &_keeps_nms_unused /*keeps*/, - num_valid_proposals /* keeps_size*/, - box_nms_info); + const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height()); + _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/, + nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/, + &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/, + num_valid_proposals /* keeps_size*/, box_nms_info); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); @@ -205,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -218,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -229,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, 
ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo 
proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -311,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -330,7 +373,7 @@ void NEGenerateProposalsLayer::run() NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -339,7 +382,7 @@ void NEGenerateProposalsLayer::run() _flatten_deltas.run(); 
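// The scores are flattened in the same way as the deltas: the flattened deltas
// feed the bounding-box transform below, while the flattened scores are what
// the NMS stage ranks, both laid out as flat per-anchor lists.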
_flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors.run(); _dequantize_deltas.run(); @@ -348,7 +391,7 @@ void NEGenerateProposalsLayer::run() // Build the boxes _bounding_box.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals.run(); } diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp index 822dcf491c..78218cbdee 100644 --- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" @@ -34,7 +35,13 @@ namespace arm_compute NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() + : _memory_group(std::move(memory_manager)), + _normalization_kernel(), + _is_nchw(false), + _permute_input(), + _permute_output(), + _permuted_input(), + _permuted_output() { } @@ -43,14 +50,14 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon); const DataLayout data_layout = input->info()->data_layout(); - const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }; + const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}; // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; _normalization_kernel = std::make_unique(); - if(!_is_nchw) + if (!_is_nchw) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -72,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } } -Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +Status NEInstanceNormalizationLayer::validate( + const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) { - return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), - InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }); + return NEInstanceNormalizationLayerKernel::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), + InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}); } void NEInstanceNormalizationLayer::run() @@ -84,7 +92,7 @@ void NEInstanceNormalizationLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Permute input - if(!_is_nchw) + if (!_is_nchw) { _permute_input.run(); } @@ -92,7 +100,7 @@ void NEInstanceNormalizationLayer::run() NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output - if(!_is_nchw) + if (!_is_nchw) { _permute_output.run(); } diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index c3ecfb430f..b7f6203efd 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ 
b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -69,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index 428cdf8c04..1a08cdeb06 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/common/LSTMParams.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,42 +40,122 @@ using namespace arm_compute::utils::info_helpers; NELSTMLayer::~NELSTMLayer() = default; NELSTMLayer::NELSTMLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), - _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), - _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), - _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), - _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), 
_output_layer_norm_out1(), - _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), _is_layer_norm_lstm(false) { } -void NELSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void NELSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor 
*output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output, - lstm_params, activation_info, cell_threshold, projection_threshold); + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -83,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input, build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); @@ -116,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input, _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); Tensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -138,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input, { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), + &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, + ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -161,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input, // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); Tensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { 
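// CIFG (coupled input-forget gate) path: no separate input-gate weights are
// used; a tensor of ones is prepared and the input gate is derived as
// 1 - forget_gate.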
_memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); @@ -183,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -201,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input, _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), + &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -228,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, + &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); @@ -237,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_out4); _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); Tensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), + &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(&_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -281,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, + &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); Tensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -304,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input, { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), + &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, + ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -335,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_state_activation.allocator()->allocate(); output_gate_out->allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, 
lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -358,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input, // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -372,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input, output_gate_out->allocator()->allocate(); } -Status NELSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status NELSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -413,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -434,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -445,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -465,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape 
concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -499,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = 
arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } // Validate output gate tmp std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + TensorShape 
in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), + &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if 
(projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -590,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -611,12 +775,12 @@ void NELSTMLayer::run() _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -624,15 +788,17 @@ void NELSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - if(_ones.info()->data_type() == DataType::F16) + if (_ones.info()->data_type() == DataType::F16) { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } else { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } _subtract_input_gate.run(); } @@ -640,13 +806,13 @@ void NELSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -659,7 +825,7 @@ void NELSTMLayer::run() _transpose_cell_state.run(); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -671,18 +837,18 @@ void NELSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -693,10 +859,10 @@ void NELSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -710,10 +876,10 @@ void NELSTMLayer::run() void NELSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index cfdeb000e0..41f9c3d700 100644 
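
[Editor's illustration] The CIFG branch in NELSTMLayer::run() above fills the _ones tensor with 1 and then runs an arithmetic subtraction right after the forget gate activation, which amounts to deriving the input gate as input_gate = 1 - forget_gate instead of computing it from its own weights. A minimal scalar sketch of that idea, assuming plain std::vector buffers rather than ITensor; the helper name is hypothetical and not part of the library:

    #include <cstddef>
    #include <vector>

    // Hypothetical scalar illustration of the coupled input-forget gate (CIFG)
    // optimization: the input gate is derived from the forget gate as
    // input_gate[i] = 1 - forget_gate[i], mirroring the ones-fill plus
    // arithmetic-subtraction sequence in NELSTMLayer::run().
    std::vector<float> derive_cifg_input_gate(const std::vector<float> &forget_gate)
    {
        std::vector<float> input_gate(forget_gate.size());
        for (std::size_t i = 0; i < forget_gate.size(); ++i)
        {
            input_gate[i] = 1.0f - forget_gate[i];
        }
        return input_gate;
    }

This is why _ones is only filled (as F16 or F32, matching the input data type) when _run_cifg_opt is set: the subtraction can reuse the forget gate output that was just computed.
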
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,36 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(), - _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), - _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), - _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), - _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), - _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add1(), + _add2(), + _mul1(), + _mul2(), + _mul3(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state1(), + _cell_state2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), _is_prepared(false) { } void NELSTMLayerQuantized::configure(const ITensor *input, - const ITensor 
*input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out) + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, 
recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -83,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input, const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -100,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _output_gate_bias = output_gate_bias; // Weights concatenation - std::vector inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights }; - std::vector recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights }; + std::vector inputs_weights_vector{input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights}; + std::vector recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights}; - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); - std::vector weights_vector{ &_recurrent_weights, &_input_weights }; - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + std::vector weights_vector{&_recurrent_weights, &_input_weights}; + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(weights_vector, &_weights, Window::DimX); _transpose_weights.configure(&_weights, &_weights_transposed); // Input concatenation - std::vector input_vector{ input, output_state_in }; + std::vector input_vector{input, output_state_in}; _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(input_vector, 
&_input, Window::DimX); // Bias concatenation - std::vector bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias }; + std::vector bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias}; _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); _concat_bias.configure(bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -137,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -159,64 +238,80 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(&_output_lowp, 
&_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state1); - _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, 
qsymm_4)); + _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state2); - _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); @@ -226,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -246,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input, } Status NELSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo 
*input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -266,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -310,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + 
NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector weights_vector; @@ -320,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -346,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -357,7 +494,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int32_t output_multiplier = 0; int32_t output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info; @@ -372,68 +510,91 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, 
batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + 
ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -442,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -508,7 +669,7 @@ void NELSTMLayerQuantized::run() void NELSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { 
_input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp index 92dcf15791..0013a521d1 100644 --- a/src/runtime/NEON/functions/NELogical.cpp +++ b/src/runtime/NEON/functions/NELogical.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NELogicalKernel.h" @@ -32,15 +33,14 @@ namespace arm_compute { struct LogicalArgs { - std::unique_ptr kernel{ nullptr }; + std::unique_ptr kernel{nullptr}; ITensorPack pack{}; }; struct NELogicalAnd::Impl : public LogicalArgs { }; -NELogicalAnd::NELogicalAnd() - : _impl(std::make_unique()) +NELogicalAnd::NELogicalAnd() : _impl(std::make_unique()) { } NELogicalAnd::~NELogicalAnd() = default; @@ -72,8 +72,7 @@ void NELogicalAnd::run() struct NELogicalOr::Impl : public LogicalArgs { }; -NELogicalOr::NELogicalOr() - : _impl(std::make_unique()) +NELogicalOr::NELogicalOr() : _impl(std::make_unique()) { } NELogicalOr::~NELogicalOr() = default; @@ -105,8 +104,7 @@ void NELogicalOr::run() struct NELogicalNot::Impl : public LogicalArgs { }; -NELogicalNot::NELogicalNot() - : _impl(std::make_unique()) +NELogicalNot::NELogicalNot() : _impl(std::make_unique()) { } NELogicalNot::~NELogicalNot() = default; diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp index 58640f40ea..31898bafc4 100644 --- a/src/runtime/NEON/functions/NEMatMul.cpp +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuMatMul.h" @@ -33,23 +34,27 @@ namespace arm_compute { struct NEMatMul::Impl { - const ITensor *lhs{ nullptr }; - const ITensor *rhs{ nullptr }; - ITensor *output{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *lhs{nullptr}; + const ITensor *rhs{nullptr}; + ITensor *output{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; }; -NEMatMul::NEMatMul() - : _impl(std::make_unique()) +NEMatMul::NEMatMul() : _impl(std::make_unique()) { } NEMatMul::~NEMatMul() = default; -void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void NEMatMul::configure(ITensor *lhs, + ITensor *rhs, + ITensor *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { _impl->lhs = lhs; _impl->rhs = rhs; @@ -58,11 +63,16 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); _impl->op = std::make_unique(); _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status NEMatMul::validate(const 
ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); } diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp index 97ddaea41d..c3861afd2c 100644 --- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" #include "src/cpu/operators/CpuMaxUnpooling.h" @@ -35,20 +36,22 @@ namespace arm_compute { struct NEMaxUnpoolingLayer::Impl { - const ITensor *src{ nullptr }; - const ITensor *indices{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *indices{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default; -NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() - : _fill_func(), _impl() +NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl() { } -void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info) +void NEMaxUnpoolingLayer::configure(ITensor *input, + ITensor *indices, + ITensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); @@ -64,7 +67,10 @@ void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *o _impl->op->configure(input->info(), indices->info(), output->info(), pool_info); } -Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info)); diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index 7626aa0db2..dec0dde56d 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" -#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index d3b1696335..d6d2e9dc46 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NENormalizationLayerKernel.h" @@ -61,13 +62,16 @@ 
void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _input_squared.allocator()->allocate(); } -Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status NENormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); return Status{}; } @@ -78,4 +82,4 @@ void NENormalizationLayer::run() _multiply_f.run(); NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } -} \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index 80c5690a4e..963e68bac7 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuPRelu.h" namespace arm_compute @@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu; struct NEPReluLayer::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPReluLayer::NEPReluLayer() - : _impl(std::make_unique()) +NEPReluLayer::NEPReluLayer() : _impl(std::make_unique()) { } -NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; +NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default; NEPReluLayer::~NEPReluLayer() = default; diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp index 8bacdd3002..253566df0f 100644 --- a/src/runtime/NEON/functions/NEPadLayer.cpp +++ b/src/runtime/NEON/functions/NEPadLayer.cpp @@ -23,13 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" namespace arm_compute { @@ -38,9 +38,9 @@ namespace uint32_t last_padding_dimension(const PaddingList &padding) { int last_padding_dim = padding.size() - 1; - for(; last_padding_dim >= 0; --last_padding_dim) + for (; last_padding_dim >= 0; --last_padding_dim) { - if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) + if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) { break; } @@ -52,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding) NEPadLayer::~NEPadLayer() = default; NEPadLayer::NEPadLayer() - : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), 
_slice_functions(), _concat_functions(), _slice_results(), _concat_results() + : _copy_function(), + _pad_kernel(), + _mode(), + _padding(), + _num_dimensions(0), + _slice_functions(), + _concat_functions(), + _slice_results(), + _concat_results() { } -void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value) +void NEPadLayer::configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value) { _pad_kernel = std::make_unique(); _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT); @@ -85,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu Coordinates ends_after{}; Coordinates strides{}; ITensor *prev = input; - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again. - if(i > 0) + if (i > 0) { strides.set(i - 1, 1); } - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { // Set the starts, ends, and strides values for the current dimension. // Due to the bit masks passed to strided slice, the values below the current dimension in // starts and ends will be ignored so do not need to be modified. - if(_mode == PaddingMode::REFLECT) + if (_mode == PaddingMode::REFLECT) { starts_before.set(i, _padding[i].first); ends_before.set(i, 0); @@ -124,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Reflect the input values for the padding before and after the input. std::vector concat_vector; - if(_padding[i].first > 0) + if (_padding[i].first > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before); + _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, + begin_mask_before, end_mask_before); concat_vector.emplace_back(&_slice_results[2 * i]); } else @@ -138,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } concat_vector.push_back(prev); - if(_padding[i].second > 0) + if (_padding[i].second > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after); + _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, + strides, begin_mask_after, end_mask_after); concat_vector.emplace_back(&_slice_results[2 * i + 1]); } else @@ -154,12 +167,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Concatenate the padding before and after with the input. ITensor *out = (i == _num_dimensions - 1) ? 
output : &_concat_results[i]; out->info()->set_quantization_info(output->info()->quantization_info()); - for(auto &v : concat_vector) + for (auto &v : concat_vector) { v->info()->set_quantization_info(input->info()->quantization_info()); } _concat_functions[i].configure(concat_vector, out, i); - if(i != _num_dimensions - 1) + if (i != _num_dimensions - 1) { _concat_results[i].allocator()->allocate(); } @@ -170,7 +183,11 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } -void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayer::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); @@ -178,15 +195,16 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p _padding = padding; _mode = mode; - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape)); // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied. _num_dimensions = last_padding_dimension(padding) + 1; - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -210,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p } } -Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { @@ -231,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < padding.size(); ++i) + for (uint32_t i = 0; i < padding.size(); ++i) { - if(mode == PaddingMode::REFLECT) + if (mode == PaddingMode::REFLECT) { ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i)); @@ -256,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, void NEPadLayer::run() { - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -268,15 +290,15 @@ void NEPadLayer::run() case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < 
_num_dimensions; ++i) { - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { - if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) + if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) { _slice_functions[2 * i].run(); } - if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) + if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) { _slice_functions[2 * i + 1].run(); } diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp index 517b86a1cb..80cd04ce6c 100644 --- a/src/runtime/NEON/functions/NEPermute.cpp +++ b/src/runtime/NEON/functions/NEPermute.cpp @@ -24,19 +24,19 @@ #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuPermute.h" namespace arm_compute { struct NEPermute::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPermute::NEPermute() - : _impl(std::make_unique()) +NEPermute::NEPermute() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index ad83a26beb..97155a9e74 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuMul.h" #include @@ -32,32 +33,42 @@ namespace arm_compute { struct NEPixelWiseMultiplication::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPixelWiseMultiplication::NEPixelWiseMultiplication() - : _impl(std::make_unique()) +NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique()) { } NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; -Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void NEPixelWiseMultiplication::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(input1->info(), 
input2->info(), output->info(), scale, overflow_policy, rounding_policy, + act_info); } void NEPixelWiseMultiplication::run() @@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run() struct NEComplexPixelWiseMultiplication::Impl { - ITensor *src_0{ nullptr }; - ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src_0{nullptr}; + ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() - : _impl(std::make_unique()) +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique()) { } NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; -Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp index 53f9dbf0a2..e017e8c21d 100644 --- a/src/runtime/NEON/functions/NEPooling3dLayer.cpp +++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool3d.h" @@ -33,9 +34,9 @@ namespace arm_compute { struct NEPooling3dLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -43,8 +44,7 @@ struct NEPooling3dLayer::Impl NEPooling3dLayer::~NEPooling3dLayer() = default; -NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -56,11 +56,12 @@ void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Po _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return cpu::CpuPool3d::validate(input, output, pool_info); } @@ -72,4 +73,4 @@ void NEPooling3dLayer::run() 
_impl->op->run(_impl->run_pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 5a3b9c5e7e..eb9125be3c 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool2d.h" @@ -33,10 +34,10 @@ namespace arm_compute { struct NEPoolingLayer::Impl { - ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - ITensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + ITensor *dst{nullptr}; + ITensor *indices{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl NEPoolingLayer::~NEPoolingLayer() = default; -NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -58,11 +58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, + {TensorType::ACL_DST_0, _impl->dst}, + {TensorType::ACL_DST_1, _impl->indices}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status NEPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return cpu::CpuPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp index aba09239cf..dbb6bf9df1 100644 --- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp +++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp @@ -27,15 +27,19 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" namespace arm_compute { -void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayer::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); @@ -44,7 +48,10 @@ void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, IT _kernel = std::move(k); } -Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const 
ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return NEPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 2caaea02d8..dd78d10d16 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -27,13 +27,14 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" namespace arm_compute @@ -41,12 +42,19 @@ namespace arm_compute using namespace arm_compute::utils::info_helpers; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -55,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. 
- const TensorInfo out - { - in - }; + const TensorInfo out{in}; return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } @@ -98,14 +103,12 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) void NEQLSTMLayer::TensorCopyKernel::run() { - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); } NEQLSTMLayer::~NEQLSTMLayer() = default; @@ -191,10 +194,17 @@ NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr memory_manager) _memory_group = MemoryGroup(std::move(memory_manager)); } -void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, - Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -206,66 +216,87 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp mm.configure(mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void NEQLSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, +void NEQLSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - 
forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); // Set lstm parameters LSTMParams lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); - _input_to_forget_weights_transposed.info()->set_quantization_info(input_to_forget_weights->info()->quantization_info()); + _input_to_forget_weights_transposed.info()->set_quantization_info( + input_to_forget_weights->info()->quantization_info()); _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info()); - _input_to_output_weights_transposed.info()->set_quantization_info(input_to_output_weights->info()->quantization_info()); - _recurrent_to_forget_weights_transposed.info()->set_quantization_info(recurrent_to_forget_weights->info()->quantization_info()); - _recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info()); - _recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info()); - - if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + _input_to_output_weights_transposed.info()->set_quantization_info( + input_to_output_weights->info()->quantization_info()); + _recurrent_to_forget_weights_transposed.info()->set_quantization_info( + recurrent_to_forget_weights->info()->quantization_info()); + _recurrent_to_cell_weights_transposed.info()->set_quantization_info( + recurrent_to_cell_weights->info()->quantization_info()); + _recurrent_to_output_weights_transposed.info()->set_quantization_info( + recurrent_to_output_weights->info()->quantization_info()); + + if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) { _convert_input_to_forget_weights_to_qsymm8 = true; // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 - _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) - .set_data_layout(input_to_forget_weights->info()->data_layout())); + _input_to_forget_weights_f32.allocator()->init( + TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); // Setup the quantize output tensor to go from F32 -> QSYMM8 - _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) - .set_data_layout(input_to_forget_weights->info()->data_layout()) - .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + _input_to_forget_weights_symm8.allocator()->init( + (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); _dequantize_input_to_forget_weights.configure(input_to_forget_weights, 
&_input_to_forget_weights_f32); _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } else { - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } const int batch_size = input->info()->dimension(1); @@ -277,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input, const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform(); _projection_bias = lstm_params.projection_bias(); - _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights; + _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + ? &_input_to_forget_weights_symm8 + : input_to_forget_weights; _input_to_cell_weights = input_to_cell_weights; _input_to_output_weights = input_to_output_weights; _recurrent_to_forget_weights = recurrent_to_forget_weights; @@ -287,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -309,22 +342,25 @@ void NEQLSTMLayer::configure(const ITensor *input, // Calculate quantized parameters for clipping. 
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); _input_to_input_reduction = std::make_unique(); _recurrent_to_input_reduction = std::make_unique(); - _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } _input_to_forget_reduction = std::make_unique(); @@ -334,19 +370,31 @@ void NEQLSTMLayer::configure(const ITensor *input, _input_to_output_reduction = std::make_unique(); _recurrent_to_output_reduction = std::make_unique(); - _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(input_to_output_weights->info(), 
_input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { _projection_reduction = std::make_unique(); - _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, + ConvertPolicy::SATURATE); } } @@ -354,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input, _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); } @@ -375,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input, const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, - &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, + &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + 
QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, + gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, forget_activation_input); forget_activation_input->allocator()->allocate(); @@ -417,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, + &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, cell_activation_input); cell_activation_input->allocator()->allocate(); @@ -454,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -469,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input, } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / 
lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, + gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } Tensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, input_activation_input); input_activation_input->allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
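The cell block above is the quantized counterpart of the usual LSTM update c_t = f_t * c_{t-1} + i_t * g_t, with an optional saturating clip applied through the LU_BOUNDED_RELU activation. For orientation, a float-domain sketch of the same arithmetic (illustrative only, not library code):

#include <algorithm>
#include <cstddef>
#include <vector>

// Float-domain sketch of the cell-state update performed above with
// NEPixelWiseMultiplication / NEArithmeticAddition / NEActivationLayer.
std::vector<float> cell_update(const std::vector<float> &f,      // forget gate
                               const std::vector<float> &i,      // input gate
                               const std::vector<float> &g,      // cell (modulation) gate
                               const std::vector<float> &c_prev, // previous cell state
                               float                     cell_clip)
{
    std::vector<float> c(c_prev.size());
    for (std::size_t k = 0; k < c.size(); ++k)
    {
        c[k] = f[k] * c_prev[k] + i[k] * g[k];
        if (cell_clip > 0.f)
        {
            c[k] = std::max(-cell_clip, std::min(cell_clip, c[k]));
        }
    }
    return c;
}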
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, - &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, - mm_out_info, output_outstage_info); - - _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, + &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); + + _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - 
_cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, + gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } Tensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, output_activation_input); output_activation_input->allocator()->allocate(); @@ -576,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -598,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -609,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input, _hidden_mul_res.allocator()->allocate(); // Projection. 
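The hidden_state_scale expression above follows from the operand scales: the output gate and tanh(cell state) are both QSYMM16 with scale 2^-15, so their 32-bit product carries scale 2^-15 * 2^-15 = 2^-30, and dividing by the requested hidden-state scale gives the factor handed to the output stage. A small numeric check, assuming a hidden-state scale of 2^-12 purely for illustration:

#include <cmath>
#include <iostream>

int main()
{
    // Assumed value for illustration only.
    const float hidden_state_scale = std::pow(2.f, -12);
    // Mirrors the expression used in the configure() code above.
    const float rescale = std::pow(2.f, -15) / hidden_state_scale * std::pow(2.f, -15);
    std::cout << rescale << " == 2^-18 = " << std::pow(2.f, -18) << "\n";
    return 0;
}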
-    if(_has_projection)
+    if (_has_projection)
     {
         const TensorInfo projection_outstage_info(*output_state_out->info());
-        const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
-        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
-        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
-        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
-        gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
-
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
+        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+        gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
+
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
-        configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,
-                     hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
-                     &_mm_projection_res, &_projection_outstage_res, projection_scale,
-                     projection_mm_out_info, projection_outstage_info);
+        configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+                     &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+                     &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
         ITensor *accumulate_destination = output_state_out;
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_gate.allocator()->allocate();
             _projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -638,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input,
             accumulate_destination = &_projection_accumulate_res;
         }
-        _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+        _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination,
+                                         ConvertPolicy::SATURATE);
         _projection_outstage_res.allocator()->allocate();
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
             _projection_accumulate_res.allocator()->allocate();
         }
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
        {
-            quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+            quantized_projection_clip =
+                utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
         }
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));
+            _projection_clip.configure(output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -quantized_projection_clip, quantized_projection_clip));
             _has_projection_clipping = true;
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
             _hidden_gate.allocator()->allocate();
@@ -672,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input,
     _copy_output.configure(output_state_out, output);
 }
-Status NEQLSTMLayer::validate(const ITensorInfo *input,
-                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status NEQLSTMLayer::validate(const ITensorInfo *input,
+                              const ITensorInfo *input_to_forget_weights,
+                              const ITensorInfo *input_to_cell_weights,
+                              const ITensorInfo *input_to_output_weights,
+                              const ITensorInfo *recurrent_to_forget_weights,
+                              const ITensorInfo *recurrent_to_cell_weights,
+                              const ITensorInfo *recurrent_to_output_weights,
+                              const ITensorInfo *forget_gate_bias,
+                              const ITensorInfo *cell_bias,
+                              const ITensorInfo *output_gate_bias,
+                              const ITensorInfo *cell_state_in,
+                              const ITensorInfo *output_state_in,
+                              const ITensorInfo *cell_state_out,
+                              const ITensorInfo *output_state_out,
+                              const ITensorInfo *output,
                               const LSTMParams<ITensorInfo> &lstm_params)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
-                                        cell_state_out, output_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+                                        cell_state_in, output_state_in, cell_state_out, output_state_out, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -694,22 +812,27 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+                                                   input_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+                                                   recurrent_to_cell_weights);
+
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); } ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -728,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -755,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. 
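The clipping threshold computed next is mapped onto the cell state's symmetric 16-bit grid (QSYMM16 carries no zero-point offset). A rough stand-in for what that mapping amounts to (quantize_qsymm16_sketch is illustrative; the library's quantize_qsymm16 applies its own rounding policy):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative stand-in: map a float clip value onto the symmetric 16-bit grid
// defined by the cell state's scale.
int16_t quantize_qsymm16_sketch(float value, float scale)
{
    const long q = std::lround(value / scale);
    return static_cast<int16_t>(std::max(-32768l, std::min(32767l, q)));
}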
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -763,60 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, 
-qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), input_to_cell_weights->quantization_info()); - const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, input_to_output_weights->data_type(), input_to_output_weights->quantization_info()); - const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); - const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_cell_weights->data_type(), recurrent_to_cell_weights->quantization_info()); - const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_output_weights->data_type(), recurrent_to_output_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), + input_to_cell_weights->quantization_info()); + const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_output_weights->data_type(), + input_to_output_weights->quantization_info()); + const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); + const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_cell_weights->data_type(), + recurrent_to_cell_weights->quantization_info()); + const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_output_weights->data_type(), + recurrent_to_output_weights->quantization_info()); 
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); + if (!lstm_params.has_cifg_opt()) { - const TensorInfo recurrent_to_input_weights_transposed(TensorShape(num_units, output_size), 1, - recurrent_to_forget_weights->data_type(), lstm_params.recurrent_to_input_weights()->quantization_info()); + const TensorInfo recurrent_to_input_weights_transposed( + TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), + lstm_params.recurrent_to_input_weights()->quantization_info()); const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1, - lstm_params.input_to_input_weights()->data_type(), lstm_params.input_to_input_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); + lstm_params.input_to_input_weights()->data_type(), + lstm_params.input_to_input_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -829,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = 
std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -859,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -882,94 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, + lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = 
lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + 
&input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -977,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = 
quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -1080,14 +1319,14 @@ void NEQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY); } @@ -1102,7 +1341,7 @@ void NEQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY); } @@ -1110,7 +1349,7 @@ void NEQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -1122,14 +1361,14 @@ void NEQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY); } @@ -1142,7 +1381,7 @@ void NEQLSTMLayer::run() _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1153,14 +1392,14 @@ void NEQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY); } @@ -1173,31 +1412,31 @@ void NEQLSTMLayer::run() _hidden_outstage.run(); // Projection. 
- if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1209,9 +1448,9 @@ void NEQLSTMLayer::run() void NEQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_convert_input_to_forget_weights_to_qsymm8) + if (_convert_input_to_forget_weights_to_qsymm8) { _input_to_forget_weights_f32.allocator()->allocate(); _input_to_forget_weights_symm8.allocator()->allocate(); @@ -1234,28 +1473,25 @@ void NEQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack packII = - { - { TensorType::ACL_SRC, _input_to_input_weights }, - { TensorType::ACL_DST, &_input_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII); + ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights}, + {TensorType::ACL_DST, &_input_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, + _input_to_input_reduction->window(), packII); - ITensorPack packRI = - { - { TensorType::ACL_SRC, _recurrent_to_input_weights }, - { TensorType::ACL_DST, &_recurrent_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI); + ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights}, + {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, + _recurrent_to_input_reduction->window(), packRI); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1271,58 +1507,44 @@ void NEQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack packIF = - { - { TensorType::ACL_SRC, _input_to_forget_weights }, - { TensorType::ACL_DST, &_input_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF); - - ITensorPack packRF = - { - { TensorType::ACL_SRC, _recurrent_to_forget_weights }, - { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF); - - ITensorPack packIC = - { - { TensorType::ACL_SRC, _input_to_cell_weights }, - { TensorType::ACL_DST, &_input_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, 
_input_to_cell_reduction->window(), packIC); - - ITensorPack packRC = - { - { TensorType::ACL_SRC, _recurrent_to_cell_weights }, - { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC); - - ITensorPack packIO = - { - { TensorType::ACL_SRC, _input_to_output_weights }, - { TensorType::ACL_DST, &_input_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO); - - ITensorPack packRO = - { - { TensorType::ACL_SRC, _recurrent_to_output_weights }, - { TensorType::ACL_DST, &_recurrent_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO); - - if(_has_projection) + ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights}, + {TensorType::ACL_DST, &_input_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, + _input_to_forget_reduction->window(), packIF); + + ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights}, + {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, + _recurrent_to_forget_reduction->window(), packRF); + + ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights}, + {TensorType::ACL_DST, &_input_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), + packIC); + + ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights}, + {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, + _recurrent_to_cell_reduction->window(), packRC); + + ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights}, + {TensorType::ACL_DST, &_input_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, + _input_to_output_reduction->window(), packIO); + + ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights}, + {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, + _recurrent_to_output_reduction->window(), packRO); + + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack pack = - { - { TensorType::ACL_SRC, _projection_weights }, - { TensorType::ACL_DST, &_projection_eff_bias } - }; - NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack); - if(_projection_bias != nullptr) + ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights}, + {TensorType::ACL_DST, &_projection_eff_bias}}; + NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), + pack); + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1332,7 +1554,7 @@ void NEQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp 
b/src/runtime/NEON/functions/NEQuantizationLayer.cpp index dad246ac89..9b72783c97 100644 --- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuQuantize.h" namespace arm_compute { struct NEQuantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEQuantizationLayer::NEQuantizationLayer() - : _impl(std::make_unique()) +NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique()) { } NEQuantizationLayer::~NEQuantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index a66ef3d27a..2824693800 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -37,13 +38,26 @@ namespace arm_compute NERNNLayer::~NERNNLayer() = default; NERNNLayer::NERNNLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_f(), + _activation(), + _fully_connected(memory_manager), + _copy_f(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } -Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status NERNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -60,24 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&shape_info, 
&shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, +void NERNNLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), + hidden_state->info()->dimension(idx_height)); _is_prepared = false; @@ -125,7 +149,7 @@ void NERNNLayer::run() void NERNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp index a9bdb50d95..68bb5d5ef3 100644 --- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp +++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp @@ -29,14 +29,20 @@ namespace arm_compute { -Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayer::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp index a24f2aac50..babec4aa92 100644 --- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" @@ -31,17 +33,22 @@ namespace arm_compute { NEROIPoolingLayer::~NEROIPoolingLayer() = default; -NEROIPoolingLayer::NEROIPoolingLayer() - : _roi_kernel() +NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel() { } -Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayer::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); @@ -53,4 +60,4 @@ void NEROIPoolingLayer::run() { NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp index a6f7be8be0..95492df126 100644 --- a/src/runtime/NEON/functions/NERange.cpp +++ b/src/runtime/NEON/functions/NERange.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NERange.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NERangeKernel.h" @@ -31,8 +32,7 @@ namespace arm_compute { NERange::~NERange() = default; -NERange::NERange() - : _kernel() +NERange::NERange() : _kernel() { } @@ -52,4 +52,4 @@ void NERange::run() { NEScheduler::get().schedule(_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index 9f96479295..d37cf4a8d0 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -25,21 +25,24 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); 
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -47,29 +50,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -91,11 +94,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax NEReduceMean::~NEReduceMean() = default; NEReduceMean::NEReduceMean(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _reduction_ops(), + _keep_dims() { } -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status NEReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -107,7 +118,8 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); @@ -124,37 +136,40 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? 
tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), + tmp_output->info()->data_type(), + tmp_output->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } @@ -166,11 +181,11 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index 9660347a16..8540d750fc 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { @@ -42,7 +43,7 @@ namespace */ size_t reduction_window_split_dimension(unsigned int axis) { - switch(axis) + switch (axis) { case 0: return Window::DimY; @@ -59,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis) NEReductionOperation::~NEReductionOperation() = default; NEReductionOperation::NEReductionOperation(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) + : _memory_group(memory_manager), + _reduction_kernel(), + _reshape(), + _output_internal(), + _window_split(0), + _reduction_axis(), + _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status NEReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const auto is_reshape_required = !keep_dims; @@ -74,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo info_before_reshape; - if(is_reshape_required) + if (is_reshape_required) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); auto shape_before_reshape = input->tensor_shape(); @@ -84,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); - info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + info_before_reshape.set_data_type(output_data_type) + .set_tensor_shape(shape_before_reshape) + .set_num_channels(input_num_channles) + .set_quantization_info(input_qinfo); output_internal = &info_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output)); } @@ -102,7 +115,8 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void NEReductionOperation::configure( + ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -112,19 +126,32 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i auto *output_internal = output; const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - if(_is_reshape_required) + if (_is_reshape_required) { - const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); - const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); - const auto num_channels = input->info()->num_channels(); - const auto qinfo = input->info()->quantization_info(); - - _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( - num_channels).set_quantization_info(qinfo)); + const auto output_internal_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_internal_shape) + .reset_padding() + .set_is_resizable(true) + .set_num_channels(num_channels) + .set_quantization_info(qinfo)); _memory_group.manage(&_output_internal); output_internal = &_output_internal; - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_external_shape) + .reset_padding() + .set_is_resizable(true)); } ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); @@ -135,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(output_internal, output); _output_internal.allocator()->allocate(); @@ -146,7 +173,7 @@ void NEReductionOperation::run() { MemoryGroupResourceScope scope_mg(_memory_group); NEScheduler::get().schedule(_reduction_kernel.get(), _window_split); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReorderLayer.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp index 427bf8c501..89cf575f38 100644 --- a/src/runtime/NEON/functions/NEReorderLayer.cpp +++ b/src/runtime/NEON/functions/NEReorderLayer.cpp @@ -23,20 +23,24 @@ */ #if defined(__aarch64__) -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEReorderKernel.h" namespace arm_compute { NEReorderLayer::~NEReorderLayer() = default; -NEReorderLayer::NEReorderLayer() - : _reorder_kernel(std::make_unique()) +NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique()) { } -void NEReorderLayer::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +void NEReorderLayer::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { auto k = std::make_unique(); k->configure(input, output, input_wf, output_wf); @@ -49,11 +53,14 @@ void NEReorderLayer::run() NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX); } -Status NEReorderLayer::validate(const ITensorInfo 
*input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +Status NEReorderLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { return NEReorderKernel::validate(input, output, input_wf, output_wf); } } // namespace arm_compute -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp index 8ee73d7390..14e41d6df4 100644 --- a/src/runtime/NEON/functions/NEReorgLayer.cpp +++ b/src/runtime/NEON/functions/NEReorgLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" -#include "src/core/NEON/kernels/NEReorgLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp index 3ccb42361e..bed70ff66c 100644 --- a/src/runtime/NEON/functions/NEReshapeLayer.cpp +++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuReshape.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NEReshapeLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEReshapeLayer::NEReshapeLayer() - : _impl(std::make_unique()) +NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique()) { } -NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; +NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default; NEReshapeLayer::~NEReshapeLayer() = default; diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp index e1988f2ab3..a90f8d2e76 100644 --- a/src/runtime/NEON/functions/NEReverse.cpp +++ b/src/runtime/NEON/functions/NEReverse.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "src/core/NEON/kernels/NEReverseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" namespace arm_compute { @@ -38,7 +37,10 @@ void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor * _kernel = std::move(k); } -Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) +Status NEReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { return NEReverseKernel::validate(input, output, axis, use_inverted_axis); } diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 09f037334e..0d011064f6 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/operators/CpuScale.h" @@ -32,16 +33,16 @@ namespace arm_compute { struct NEScale::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor dx{ nullptr }; /**< 
Element's distance between the X real coordinate and the smallest X following integer */ - Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ - Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + Tensor offsets{ + nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + std::unique_ptr op{nullptr}; }; -NEScale::NEScale() - : _impl(std::make_unique()) +NEScale::NEScale() : _impl(std::make_unique()) { } NEScale::~NEScale() = default; @@ -57,25 +58,33 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & // Configure for size of allocation of internal tensors // Get data layout and width/height indices - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const DataLayout data_layout = + info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape TensorShape shape(output->info()->dimension(idx_width)); shape.set(1, output->info()->dimension(idx_height), false); - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(data_layout, input->info()->data_type(), policy_to_use, info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + data_layout, input->info()->data_type(), policy_to_use, info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { const TensorInfo tensor_info_dxdy(shape, Format::F32); const TensorInfo tensor_info_offsets(shape, Format::S32); @@ -83,7 +92,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & _impl->dx.allocator()->init(tensor_info_dxdy); _impl->dy.allocator()->init(tensor_info_dxdy); _impl->offsets.allocator()->init(tensor_info_offsets); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -109,7 +118,8 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && + policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp index 26c2eb8fe9..55cad2202b 100644 --- a/src/runtime/NEON/functions/NESelect.cpp +++ b/src/runtime/NEON/functions/NESelect.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NESelect.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESelectKernel.h" diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp index 4a8912bfe9..12d43adc84 100644 --- a/src/runtime/NEON/functions/NESlice.cpp +++ b/src/runtime/NEON/functions/NESlice.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -34,7 +35,10 @@ namespace arm_compute { namespace experimental { -void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void NESlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo _kernel = std::move(k); } -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - 
})); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct NESlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NESlice::NESlice() - : _impl(std::make_unique()) +NESlice::NESlice() : _impl(std::make_unique()) { } -NESlice::NESlice(NESlice &&) = default; +NESlice::NESlice(NESlice &&) = default; NESlice &NESlice::operator=(NESlice &&) = default; NESlice::~NESlice() = default; -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::NESlice::validate(input, output, starts, ends); } diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 0947ff94a6..e3c2012d05 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" @@ -35,10 +37,10 @@ namespace arm_compute template struct NESoftmaxLayerGeneric::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor max{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor max{nullptr}; + std::unique_ptr> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -53,9 +55,9 @@ NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(std::shared_ptr NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric &NESoftmaxLayerGeneric::operator=(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric::~NESoftmaxLayerGeneric() = default; template @@ -68,12 +70,13 @@ void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, f _impl->op = std::make_unique>(); _impl->op->configure(input->info(), output->info(), beta, axis); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template -Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis)); @@ -81,7 +84,7 @@ Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const I } template -void 
NESoftmaxLayerGeneric::run() +void NESoftmaxLayerGeneric::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp index c4509510dc..556ebdd800 100644 --- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" @@ -37,17 +38,19 @@ namespace arm_compute { NESpaceToBatchLayer::~NESpaceToBatchLayer() = default; -NESpaceToBatchLayer::NESpaceToBatchLayer() - : _space_to_batch_kernel(), _fill_f(), _has_padding(false) +NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false) { } -void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -57,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s _space_to_batch_kernel->configure(input, block_shape, paddings, output); } -void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -71,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_ _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const 
Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -89,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void NESpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { _fill_f->run(); } diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp index b37bf0d20f..846b619429 100644 --- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" @@ -36,8 +37,7 @@ namespace arm_compute { NESpaceToDepthLayer::~NESpaceToDepthLayer() = default; -NESpaceToDepthLayer::NESpaceToDepthLayer() - : _space_to_depth_kernel() +NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel() { } diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp index db19bbb824..53b09e9ae5 100644 --- a/src/runtime/NEON/functions/NESplit.cpp +++ b/src/runtime/NEON/functions/NESplit.cpp @@ -34,7 +34,7 @@ namespace arm_compute { void NESplit::run() { - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp index 68554e0931..03e7026691 100644 --- a/src/runtime/NEON/functions/NEStackLayer.cpp +++ b/src/runtime/NEON/functions/NEStackLayer.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStackLayerKernel.h" @@ -38,9 +39,7 @@ namespace arm_compute NEStackLayer::~NEStackLayer() = default; NEStackLayer::NEStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -54,7 +53,7 @@ void NEStackLayer::configure(const std::vector &input, int axis, ITen // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels[i] = std::make_unique(); _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output); @@ -72,7 +71,7 @@ Status NEStackLayer::validate(const std::vector &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -85,7 +84,7 @@ Status NEStackLayer::validate(const std::vector &input, int axis, void NEStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY); } diff --git 
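[Editor's note] The NEStackLayer hunk above wraps negative axis values with wrap_around(axis, rank + 1) before configuring the per-input kernels. A minimal sketch of that wrapping convention follows; wrap_axis below is my own helper written to mirror the behaviour, not the library's utility.

    #include <cassert>

    // Map an axis in [-(rank + 1), rank] onto [0, rank], mirroring the
    // wrap_around(axis, rank + 1) call in the NEStackLayer hunk above.
    static unsigned int wrap_axis(int axis, unsigned int rank)
    {
        const int m = static_cast<int>(rank) + 1;
        const int r = axis % m;
        return static_cast<unsigned int>(r >= 0 ? r : r + m);
    }

    int main()
    {
        // Stacking 4-D inputs: axis -1 addresses the new, outermost dimension.
        assert(wrap_axis(0, 4) == 0);
        assert(wrap_axis(-1, 4) == 4);
        assert(wrap_axis(-5, 4) == 0);
        return 0;
    }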
a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp index 4f50749a4f..6a3ac8be05 100644 --- a/src/runtime/NEON/functions/NEStridedSlice.cpp +++ b/src/runtime/NEON/functions/NEStridedSlice.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -32,9 +33,14 @@ namespace arm_compute { namespace experimental { -void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); @@ -43,9 +49,14 @@ void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, _kernel = std::move(k); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct NEStridedSlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEStridedSlice::NEStridedSlice() - : _impl(std::make_unique()) +NEStridedSlice::NEStridedSlice() : _impl(std::make_unique()) { } -NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; +NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default; NEStridedSlice::~NEStridedSlice() = default; -void NEStridedSlice::configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { _impl->src = input; _impl->dst = output; @@ -84,10 +99,16 @@ void NEStridedSlice::run() _impl->op->run(pack); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::NEStridedSlice::validate(input, output, 
starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp index 526603f1a3..d10b1c8e95 100644 --- a/src/runtime/NEON/functions/NETile.cpp +++ b/src/runtime/NEON/functions/NETile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NETile.h" -#include "src/core/NEON/kernels/NETileKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NETileKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp index 78c7ea202a..0144a85e8c 100644 --- a/src/runtime/NEON/functions/NETranspose.cpp +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuTranspose.h" @@ -31,13 +32,12 @@ namespace arm_compute { struct NETranspose::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NETranspose::NETranspose() - : _impl(std::make_unique()) +NETranspose::NETranspose() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp index 0ffab5e92a..2f7ed2bb1f 100644 --- a/src/runtime/NEON/functions/NEUnstack.cpp +++ b/src/runtime/NEON/functions/NEUnstack.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. 
Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -55,19 +58,19 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace NEUnstack::NEUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } void NEUnstack::configure(const ITensor *input, const std::vector &output_vector, int axis) { std::vector outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ITensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -81,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector &ou Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, + slice_end_mask, (1 << axis_u)); } } @@ -102,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vectortensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void NEUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index a8eded29ff..8d77abcfc7 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" #include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuWinogradConv2d.h" -#include "src/core/NEON/kernels/convolution/common/utils.hpp" - namespace arm_compute { using namespace arm_compute::experimental; @@ -42,14 +42,14 @@ using namespace 
arm_compute::experimental; struct NEWinogradConvolutionLayer::Impl { MemoryGroup memory_group{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - const ITensor *original_weights{ nullptr }; - bool is_prepared{ false }; - bool is_activationlayer_enabled{ false }; + const ITensor *original_weights{nullptr}; + bool is_prepared{false}; + bool is_activationlayer_enabled{false}; DataLayout data_layout{}; }; @@ -61,17 +61,24 @@ NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptroriginal_weights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, act_info, enable_fast_math); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } void NEWinogradConvolutionLayer::run() @@ -82,15 +89,20 @@ void NEWinogradConvolutionLayer::run() _impl->op->run(_impl->run_pack); } -Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void NEWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); _impl->original_weights->mark_as_unused(); diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index b0a553212a..d4d6193fce 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" + #include namespace arm_compute @@ -63,7 +64,7 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, _num_threads); - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; info.cpu_info = &cpu_info(); @@ -73,10 +74,10 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win { const unsigned int num_windows = num_threads; std::vector workloads(num_windows); - 
for(unsigned int t = 0; t < num_windows; t++) + for (unsigned int t = 0; t < num_windows; t++) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); @@ -92,7 +93,7 @@ void OMPScheduler::run_workloads(std::vector const unsigned int amount_of_work = static_cast(workloads.size()); const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work); - if(num_threads_to_use < 1) + if (num_threads_to_use < 1) { return; } @@ -100,8 +101,9 @@ void OMPScheduler::run_workloads(std::vector ThreadInfo info; info.cpu_info = &cpu_info(); info.num_threads = num_threads_to_use; - #pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) schedule(static, 1) - for(unsigned int wid = 0; wid < amount_of_work; ++wid) +#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \ + schedule(static, 1) + for (unsigned int wid = 0; wid < amount_of_work; ++wid) { const int tid = omp_get_thread_num(); diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp index a47fa184fa..d746f618b5 100644 --- a/src/runtime/OffsetLifetimeManager.cpp +++ b/src/runtime/OffsetLifetimeManager.cpp @@ -43,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment) return (remainder != 0U) ? offset + (alignment - remainder) : offset; } } // namespace -OffsetLifetimeManager::OffsetLifetimeManager() - : _blob(0) +OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0) { } @@ -71,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings() // Update blob size size_t max_aggregated_size = 0; - std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b) - { - max_aggregated_size += b.max_size; - _blob.alignment = std::max(_blob.alignment, b.max_alignment); - }); + std::for_each(std::begin(_free_blobs), std::end(_free_blobs), + [&](const Blob &b) + { + max_aggregated_size += b.max_size; + _blob.alignment = std::max(_blob.alignment, b.max_alignment); + }); max_aggregated_size += _free_blobs.size() * _blob.alignment; _blob.owners = std::max(_blob.owners, _free_blobs.size()); _blob.size = std::max(_blob.size, max_aggregated_size); // Calculate group mappings - auto &group_mappings = _active_group->mappings(); + auto &group_mappings = _active_group->mappings(); size_t offset = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp index ffedf5586c..8f3c1a84ba 100644 --- a/src/runtime/OffsetMemoryPool.cpp +++ b/src/runtime/OffsetMemoryPool.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
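[Editor's note] The OMPScheduler::run_workloads hunk above only re-wraps the OpenMP pragma across two lines; the scheduling policy itself is unchanged. As a rough, self-contained sketch of that pattern (hypothetical workloads, not the library's IScheduler types; build with -fopenmp), schedule(static, 1) deals workload wid out round-robin across the team:

    #include <cstdio>
    #include <functional>
    #include <vector>
    #include <omp.h>

    int main()
    {
        const int num_threads_to_use = 4;
        const int amount_of_work     = 8;

        // Each workload receives the id of the thread that runs it.
        std::vector<std::function<void(int)>> workloads(amount_of_work);
        for (int t = 0; t < amount_of_work; ++t)
        {
            workloads[t] = [t](int tid) { std::printf("workload %d on thread %d\n", t, tid); };
        }

        // schedule(static, 1) hands out workloads one at a time, round-robin per thread.
    #pragma omp parallel for num_threads(num_threads_to_use) default(shared) schedule(static, 1)
        for (int wid = 0; wid < amount_of_work; ++wid)
        {
            workloads[wid](omp_get_thread_num());
        }
        return 0;
    }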
*/ -#include - #include "arm_compute/runtime/OffsetMemoryPool.h" #include "arm_compute/core/Error.h" @@ -31,6 +29,8 @@ #include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/runtime/Types.h" +#include + namespace arm_compute { OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info) @@ -50,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) ARM_COMPUTE_ERROR_ON(_blob == nullptr); // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second)); @@ -59,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) void OffsetMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp index a8ad53da90..19415b35cf 100644 --- a/src/runtime/OperatorTensor.cpp +++ b/src/runtime/OperatorTensor.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/runtime/OperatorTensor.h" + #include "arm_compute/runtime/MemoryRegion.h" #include "support/Cast.h" @@ -47,7 +48,7 @@ ITensorInfo *OperatorTensor::info() uint8_t *OperatorTensor::buffer() const { - switch(_mem_type) + switch (_mem_type) { case MemoryType::CPU: return (uint8_t *)utils::cast::polymorphic_downcast(_memory->region())->buffer(); diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp index 87376a71a4..7fb9bd8000 100644 --- a/src/runtime/PoolManager.cpp +++ b/src/runtime/PoolManager.cpp @@ -31,8 +31,7 @@ using namespace arm_compute; -PoolManager::PoolManager() - : _free_pools(), _occupied_pools(), _sem(), _mtx() +PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx() { } @@ -52,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool) ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!"); arm_compute::lock_guard lock(_mtx); - auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr &pool_it) - { - return pool_it.get() == pool; - }); + auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), + [pool](const std::unique_ptr &pool_it) { return pool_it.get() == pool; }); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!"); _free_pools.splice(std::begin(_free_pools), _occupied_pools, it); _sem->signal(); @@ -78,7 +75,7 @@ std::unique_ptr PoolManager::release_pool() arm_compute::lock_guard lock(_mtx); ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!"); - if(!_free_pools.empty()) + if (!_free_pools.empty()) { std::unique_ptr pool = std::move(_free_pools.front()); ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr); diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp index d1dea066e7..1de8d2abdb 100644 --- a/src/runtime/RuntimeContext.cpp +++ b/src/runtime/RuntimeContext.cpp @@ -28,8 +28,7 @@ namespace arm_compute { -RuntimeContext::RuntimeContext() - : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) +RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) { } diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index 
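[Editor's note] The OffsetLifetimeManager hunks above keep the same sizing rule while reflowing the lambda: each free blob contributes its maximum size, and the aggregate is padded by one alignment per blob. A small standalone sketch of the align-up rule (my own align_up, written to mirror the align_offset() shown in context above):

    #include <cassert>
    #include <cstddef>

    // Round 'offset' up to the next multiple of 'alignment' (alignment > 0),
    // as align_offset() does in the OffsetLifetimeManager hunk above.
    static std::size_t align_up(std::size_t offset, std::size_t alignment)
    {
        const std::size_t remainder = offset % alignment;
        return (remainder != 0U) ? offset + (alignment - remainder) : offset;
    }

    int main()
    {
        assert(align_up(0, 64) == 0);
        assert(align_up(1, 64) == 64);
        assert(align_up(64, 64) == 64);
        assert(align_up(100, 64) == 128);
        return 0;
    }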
0713b9a2ad..e52fb59940 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -76,7 +76,7 @@ void Scheduler::set(Type t) bool Scheduler::is_available(Type t) { - if(t == Type::CUSTOM) + if (t == Type::CUSTOM) { return _custom_scheduler != nullptr; } @@ -93,11 +93,12 @@ Scheduler::Type Scheduler::get_type() IScheduler &Scheduler::get() { - if(_scheduler_type == Type::CUSTOM) + if (_scheduler_type == Type::CUSTOM) { - if(_custom_scheduler == nullptr) + if (_custom_scheduler == nullptr) { - ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr &scheduler) before Scheduler::get()"); + ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr &scheduler) " + "before Scheduler::get()"); } else { @@ -106,13 +107,13 @@ IScheduler &Scheduler::get() } else { - if(_schedulers.empty()) + if (_schedulers.empty()) { _schedulers = init(); } auto it = _schedulers.find(_scheduler_type); - if(it != _schedulers.end()) + if (it != _schedulers.end()) { return *it->second; } diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp index cc21d62630..4fb08d79f5 100644 --- a/src/runtime/SchedulerFactory.cpp +++ b/src/runtime/SchedulerFactory.cpp @@ -48,7 +48,7 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory: std::unique_ptr SchedulerFactory::create(Type type) { - switch(type) + switch (type) { case Type::ST: { diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp index 6f9a32c879..74ee539fec 100644 --- a/src/runtime/SchedulerUtils.cpp +++ b/src/runtime/SchedulerUtils.cpp @@ -47,35 +47,34 @@ std::pair split_2d(unsigned max_threads, std::size_t m, std: double ratio = m / static_cast(n); // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); + const unsigned adjusted = std::round(std::sqrt(max_threads * ratio)); //find the nearest factor of max_threads - for(unsigned i = 0; i != adjusted; ++i) + for (unsigned i = 0; i != adjusted; ++i) { //try down const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) + if (max_threads % adj_down == 0) { - return { adj_down, max_threads / adj_down }; + return {adj_down, max_threads / adj_down}; } //try up const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) + if (max_threads % adj_up == 0) { - return { adj_up, max_threads / adj_up }; + return {adj_up, max_threads / adj_up}; } } //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) + if (m > n) { - return { std::min(m, max_threads), 1 }; + return {std::min(m, max_threads), 1}; } else { - return { 1, std::min(n, max_threads) }; + return {1, std::min(n, max_threads)}; } } #endif /* #ifndef BARE_METAL */ diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp index ae16c8be0a..f87256abb1 100644 --- a/src/runtime/SubTensor.cpp +++ b/src/runtime/SubTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -SubTensor::SubTensor() - : _parent(nullptr), _info() +SubTensor::SubTensor() : _parent(nullptr), _info() { } diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp index 6dcef9f0b5..f17e323694 100644 --- a/src/runtime/Tensor.cpp +++ b/src/runtime/Tensor.cpp @@ -25,8 +25,7 @@ namespace arm_compute { -Tensor::Tensor(IRuntimeContext *) - : _allocator(this) +Tensor::Tensor(IRuntimeContext *) : _allocator(this) { } diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index 4ae27c59fc..372852bfea 100644 --- 
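[Editor's note] The SchedulerUtils hunk above only changes brace and spacing style in split_2d(); the algorithm still starts from the aspect-ratio-ideal split sqrt(max_threads * m/n) and walks outwards to the nearest factor of max_threads. A standalone sketch of that search (same logic, my own naming):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>

    // Split max_threads into a (rows, cols) grid for an m x n problem, starting
    // from the aspect-ratio-ideal split and searching for the nearest factor.
    static std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
    {
        const double   ratio    = static_cast<double>(m) / static_cast<double>(n);
        const unsigned adjusted = static_cast<unsigned>(std::round(std::sqrt(max_threads * ratio)));

        for (unsigned i = 0; i != adjusted; ++i)
        {
            const unsigned adj_down = adjusted - i;
            if (adj_down != 0 && max_threads % adj_down == 0)
            {
                return {adj_down, max_threads / adj_down};
            }
            const unsigned adj_up = adjusted + i;
            if (max_threads % adj_up == 0)
            {
                return {adj_up, max_threads / adj_up};
            }
        }
        // No factor near the ideal split: bias the grid towards the larger dimension.
        const unsigned cap_m = static_cast<unsigned>(std::min<std::size_t>(m, max_threads));
        const unsigned cap_n = static_cast<unsigned>(std::min<std::size_t>(n, max_threads));
        return (m > n) ? std::make_pair(cap_m, 1u) : std::make_pair(1u, cap_n);
    }

    int main()
    {
        const auto grid = split_2d(8, 1024, 256); // ideal ~5.7 rows -> nearest factor pair (4, 2)
        std::printf("rows=%u cols=%u\n", grid.first, grid.second);
        return 0;
    }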
a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -43,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c const size_t parent_dims = parent_info.num_dimensions(); const size_t child_dims = child_info.num_dimensions(); - if(child_dims <= parent_dims) + if (child_dims <= parent_dims) { - for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) + for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) { const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1]; - if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) + if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) { is_valid = false; break; @@ -65,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c } } // namespace -TensorAllocator::TensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory() +TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory() { } @@ -88,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept { - if(&o != this) + if (&o != this) { _owner = o._owner; o._owner = nullptr; @@ -117,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates & _memory = Memory(allocator._memory.region()); // Init tensor info with new dimensions - size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); - sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size); + size_t total_size = + parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); + sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), + parent_info.offset_element_in_bytes(coords), total_size); // Set TensorInfo init(sub_info); @@ -133,7 +134,7 @@ void TensorAllocator::allocate() { // Align to 64-byte boundaries by default if alignment is not specified const size_t alignment_to_use = (alignment() != 0) ? 
alignment() : 64; - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { _memory.set_owned_region(std::make_unique(info().total_size(), alignment_to_use)); } diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp index 15e9d43a49..a7f7b5f3cb 100644 --- a/src/runtime/Utils.cpp +++ b/src/runtime/Utils.cpp @@ -41,20 +41,17 @@ static const std::string information = const std::string &string_from_scheduler_type(Scheduler::Type t) { - static std::map scheduler_type_map = - { - { Scheduler::Type::ST, "Single Thread" }, - { Scheduler::Type::CPP, "C++11 Threads" }, - { Scheduler::Type::OMP, "OpenMP Threads" }, - { Scheduler::Type::CUSTOM, "Custom" } - }; + static std::map scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"}, + {Scheduler::Type::CPP, "C++11 Threads"}, + {Scheduler::Type::OMP, "OpenMP Threads"}, + {Scheduler::Type::CUSTOM, "Custom"}}; return scheduler_type_map[t]; } void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints) { - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr); ctx->scheduler()->schedule(kernel, hints); @@ -68,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis) { // We need only 1 stage for all axis except x-axis - if(axis != 0) + if (axis != 0) { return 1; } diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp index 1bfb8124e9..aba32871d0 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include namespace arm_compute @@ -37,25 +38,27 @@ namespace cl_direct_conv { using namespace arm_compute::misc::shape_calculator; -ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) - : IClDirectConvKernelConfig(gpu) +ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) { } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClDirectConvConfigArray configs_G71(&ClDirectConvDefaultConfigBifrost::configure_G71_f32, - &ClDirectConvDefaultConfigBifrost::configure_G71_f16, - &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + ClDirectConvConfigArray configs_G71( + &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16, + &ClDirectConvDefaultConfigBifrost::configure_G71_u8); - ClDirectConvConfigArray configs_default(&ClDirectConvDefaultConfigBifrost::configure_default_f32, - 
&ClDirectConvDefaultConfigBifrost::configure_default_f16, - &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + ClDirectConvConfigArray configs_default( + &ClDirectConvDefaultConfigBifrost::configure_default_f32, + &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G71: func = configs_G71.get_function(src->data_type()); @@ -69,18 +72,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const IT return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 2; } @@ -93,18 +98,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -117,18 +124,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -141,18 +150,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(c return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = 
misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 2; } @@ -165,18 +176,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_ return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -188,5 +201,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_ return desc; } -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h index 6b60b2c007..ed6a4c3c68 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h @@ -41,15 +41,21 @@ public: ClDirectConvDefaultConfigBifrost(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp index 8f2fd82412..4b7666d5aa 100644 
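[Editor's note] Both the Bifrost and Valhall heuristics in the hunks above pick a configure_* member function from a small per-data-type array and dispatch through a member-function pointer. A stripped-down sketch of that pattern follows; Heuristic, DataType, and KernelInfo below are hypothetical stand-ins, not the library's types (compile as C++14 or later).

    #include <array>
    #include <cstdio>

    enum class DataType { F32, F16, Int8 };

    struct KernelInfo { int m0 = 1, n0 = 1; };

    class Heuristic
    {
    public:
        KernelInfo configure(DataType dt, int output_channels)
        {
            // One entry per supported data type, same idea as ClDirectConvConfigArray.
            using Fn = KernelInfo (Heuristic::*)(int);
            const std::array<Fn, 3> configs = {&Heuristic::configure_f32, &Heuristic::configure_f16,
                                               &Heuristic::configure_int8};
            const Fn func = configs.at(static_cast<std::size_t>(dt));
            return (this->*func)(output_channels);
        }

    private:
        // Toy rules only: wider m0 once there are enough output channels.
        KernelInfo configure_f32(int ofm)  { return {ofm > 16 ? 2 : 1, 4}; }
        KernelInfo configure_f16(int ofm)  { return {ofm > 16 ? 4 : 1, 4}; }
        KernelInfo configure_int8(int ofm) { return {ofm > 16 ? 4 : 1, 4}; }
    };

    int main()
    {
        Heuristic h;
        const KernelInfo info = h.configure(DataType::F16, 32);
        std::printf("m0=%d n0=%d\n", info.m0, info.n0);
        return 0;
    }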
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include namespace arm_compute @@ -37,25 +38,27 @@ namespace cl_direct_conv { using namespace arm_compute::misc::shape_calculator; -ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) - : IClDirectConvKernelConfig(gpu) +ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) { } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClDirectConvConfigArray configs_G78(&ClDirectConvDefaultConfigValhall::configure_G78_f32, - &ClDirectConvDefaultConfigValhall::configure_G78_f16, - &ClDirectConvDefaultConfigValhall::configure_G78_u8); + ClDirectConvConfigArray configs_G78( + &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); - ClDirectConvConfigArray configs_G57(&ClDirectConvDefaultConfigValhall::configure_G57_f32, - &ClDirectConvDefaultConfigValhall::configure_G57_f16, - &ClDirectConvDefaultConfigValhall::configure_G78_u8); + ClDirectConvConfigArray configs_G57( + &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G57: func = configs_G57.get_function(src->data_type()); @@ -70,15 +73,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const IT return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -87,11 +92,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( desc.export_weights_to_cl_image = 
export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { - if(ofm == 4) + if (ofm == 4) { desc.m0 = 1; desc.n0 = 4; @@ -113,7 +118,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( } else { - if(m < 64) + if (m < 64) { desc.m0 = 1; desc.n0 = 1; @@ -131,15 +136,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -149,15 +156,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { // k0 should be as larger as possible. However, we should avoid // having left-over for loops that make the implementation slower. - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -166,9 +173,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( desc.k0 = 4; } - if(is_pointwise) + if (is_pointwise) { - if(ofm == 4) + if (ofm == 4) { desc.m0 = 1; desc.n0 = 4; @@ -187,15 +194,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( } else { - if(m < 64) + if (m < 64) { desc.m0 = 1; desc.n0 = 1; - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -206,9 +213,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( } else { - if(ofm >= 16) + if (ofm >= 16) { - if(m / 6 > 24000) + if (m / 6 > 24000) { desc.m0 = 6; } @@ -223,11 +230,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( { desc.m0 = 2; desc.n0 = 8; - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -243,18 +250,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if 
(output_shape[0] > 16) { desc.m0 = 4; } @@ -267,15 +276,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(c return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t m = dst_shape[1] * dst_shape[2]; @@ -283,9 +294,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { desc.m0 = 1; desc.n0 = 1; @@ -300,9 +311,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( } else { - if(m < 64) + if (m < 64) { - if(m == 1) + if (m == 1) { desc.m0 = 1; desc.n0 = 1; @@ -327,15 +338,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -344,9 +357,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { desc.m0 = 2; desc.n0 = 1; @@ -361,9 +374,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( } else { - if(m < 64) + if (m < 64) { - if(m == 1) + if (m == 1) { desc.m0 = 1; desc.n0 = 1; @@ -378,7 +391,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( } else { - if(ofm > 16) + if (ofm > 16) { desc.m0 = 4; desc.n0 = 8; @@ -396,5 +409,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( return desc; } -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h 
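[Editor's note] A recurring rule in the Valhall f16 hunks above is "make k0 as large as possible without leaving a remainder loop": try 16, then 8, then fall back to 4. A tiny sketch of that selection (pick_k0 is a hypothetical helper written to mirror the (k % 16) / (k % 8) checks shown above):

    #include <cassert>
    #include <cstdint>

    // Pick the largest k0 in {16, 8, 4} that divides the accumulation length k.
    static int32_t pick_k0(int32_t k)
    {
        if ((k % 16) == 0)
        {
            return 16;
        }
        if ((k % 8) == 0)
        {
            return 8;
        }
        return 4; // fallback, accepted even when k is not a multiple of 4
    }

    int main()
    {
        assert(pick_k0(64) == 16);
        assert(pick_k0(24) == 8);
        assert(pick_k0(20) == 4);
        return 0;
    }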
b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h index f9d5c5299e..efd879a567 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h @@ -41,15 +41,21 @@ public: ClDirectConvDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h index 232167fc59..2c2509f70b 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -46,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(GPUTarget::G71); @@ -59,6 +59,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h index 6104d73594..e5b270c720 100644 --- a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -52,8 +53,7 @@ public: * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) - : 
_configs{ func_f32, func_f16, func_int8 } + ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -65,7 +65,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -92,8 +92,7 @@ public: * * @param[in] arch GPU target */ - IClDirectConvKernelConfig(GPUTarget arch) - : _target(arch) + IClDirectConvKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig); @@ -105,11 +104,12 @@ public: * @param[in] wei Weights tensor * @param[in] conv_info Convolution info */ - virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; protected: GPUTarget _target; }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp index 5311fdcec3..98ebf3ebbe 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" -#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/GPUTarget.h" @@ -30,28 +29,34 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + namespace arm_compute { namespace cl_dwc { namespace { -DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, bool is_g71) +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; desc.export_input_to_cl_image = false; - if(is_g71) + if (is_g71) { desc.export_weights_to_cl_image = false; } @@ -60,17 +65,17 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { desc.n0 = 4; } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + 
else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -81,14 +86,15 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } @@ -106,16 +112,20 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we return desc; } -DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, bool is_g71) +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Src and weights have the same dimension indices - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape src_shape = src->tensor_shape(); const TensorShape wei_shape = wei->tensor_shape(); const size_t src_w = src_shape[idx_w]; @@ -124,7 +134,7 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we desc.export_input_to_cl_image = false; - if(is_g71) + if (is_g71) { desc.export_weights_to_cl_image = false; } @@ -133,9 +143,9 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -146,11 +156,11 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -161,20 +171,21 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || 
(kernel_w == 1)) { desc.m0 = 1; } else { - if((src_w % 5) == 0) + if ((src_w % 5) == 0) { desc.m0 = 5; } @@ -194,27 +205,30 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } } // namespace -ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) - : IClDWCNativeKernelConfig(gpu) +ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) { } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { - using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); - ClDWCNativeConfigArray configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32, - &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + ClDWCNativeConfigArray configs_G71( + &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); - ClDWCNativeConfigArray configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + ClDWCNativeConfigArray configs_G7x( + &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G71: func = configs_G71.get_function(src->data_type()); @@ -228,43 +242,58 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInf return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true); } -DWCComputeKernelInfo 
ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { ARM_COMPUTE_UNUSED(wei); DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = false; desc.n0 = (depth_multiplier == 1) ? 4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) { desc.m0 = 2; } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h index cec2cae5dd..41d86c9c14 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -41,20 +41,38 @@ public: ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); // Inherited overridden method - DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) override; + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; private: - DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - 
unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); }; } // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp index 51f3787875..ef1bb3858c 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" -#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/GPUTarget.h" @@ -30,31 +29,36 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + namespace arm_compute { namespace cl_dwc { -ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) - : IClDWCNativeKernelConfig(gpu) +ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) { } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { - using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); - ClDWCNativeConfigArray configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32, - &ClDWCNativeDefaultConfigValhall::configure_G78_f16, - &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + ClDWCNativeConfigArray configs_G78( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); - ClDWCNativeConfigArray configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32, - &ClDWCNativeDefaultConfigValhall::configure_G77_f16, - 
&ClDWCNativeDefaultConfigValhall::configure_G78_u8); + ClDWCNativeConfigArray configs_G77( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G77: func = configs_G77.get_function(src->data_type()); @@ -69,15 +73,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInf return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; @@ -85,17 +92,17 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { desc.n0 = 4; } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -106,14 +113,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } @@ -131,16 +139,19 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == 
DataLayout::NHWC) { // Src and weights have the same dimension indices - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape src_shape = src->tensor_shape(); const TensorShape wei_shape = wei->tensor_shape(); const size_t src_w = src_shape[idx_w]; @@ -150,9 +161,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -163,11 +174,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -178,20 +189,21 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } else { - if((src_w % 5) == 0) + if ((src_w % 5) == 0) { desc.m0 = 5; } @@ -210,19 +222,22 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { ARM_COMPUTE_UNUSED(wei); DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = false; desc.n0 = (depth_multiplier == 1) ? 
4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) { desc.m0 = 2; } @@ -235,15 +250,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITe return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; @@ -251,9 +269,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -264,11 +282,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -279,14 +297,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h index 4d51fa668c..fabce77b54 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -41,18 +41,33 @@ public: ClDWCNativeDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) override; + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, 
+ const Size2D &dilation, + unsigned int depth_multiplier) override; private: - DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); }; } // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp index 5593c6de61..c8b006c546 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -32,7 +32,7 @@ namespace cl_dwc bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) { // Check whether we can use the cl image with the weights. - if(!export_to_cl_image(weights)) + if (!export_to_cl_image(weights)) { return false; } @@ -45,12 +45,12 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons: // 1- When the kernel size is 1x1 // 2- When the depth multiplier is greater than 1 and not multiple of 4. 
- if((kernel_w == 1) && (kernel_h == 1)) + if ((kernel_w == 1) && (kernel_h == 1)) { return false; } - if((depth_multiplier > 1) && (depth_multiplier % 4) != 0) + if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0) { return false; } @@ -58,4 +58,4 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul return true; } } // namespace cl_dwc -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h index c08053dcb3..49ce6ff479 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h @@ -46,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: // The heuristic for Midgard is the same as the one used for Arm Mali-G71 diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h index b5df132a12..614a6622df 100644 --- a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -52,8 +53,7 @@ public: * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -65,7 +65,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -92,8 +92,7 @@ public: * * @param[in] arch GPU target */ - IClDWCNativeKernelConfig(GPUTarget arch) - : _target(arch) + IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); @@ -107,8 +106,11 @@ public: * @param[in] dilation Kernel dilation * @param[in] depth_multiplier Output feature maps multiplier */ - virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) = 0; + virtual DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) = 0; protected: GPUTarget _target; diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp index 990f050112..3380d8f1b7 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -35,17 +35,19 @@ namespace cl_indirect_conv { using namespace arm_compute::misc::shape_calculator; -ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) - : IClIndirectConvKernelConfig(gpu) +ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu) { } 
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClIndirectConvConfigArray configs_G77(&ClIndirectConvDefaultConfigValhall::configure_G77_f32, - &ClIndirectConvDefaultConfigValhall::configure_G77_f16); + ClIndirectConvConfigArray configs_G77( + &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16); // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned @@ -57,22 +59,24 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); - const int32_t stride_x = conv_info.stride().first; - const int32_t stride_y = conv_info.stride().second; - const int32_t ofm = dst_shape[0]; - const int32_t m = (dst_shape[1]/ stride_x) * (dst_shape[2] / stride_y); + const int32_t stride_x = conv_info.stride().first; + const int32_t stride_y = conv_info.stride().second; + const int32_t ofm = dst_shape[0]; + const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y); desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(ofm <= 4) + if (ofm <= 4) { desc.m0 = 1; desc.n0 = 2; @@ -82,7 +86,7 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3 { // The 16000 threshold value has been identified as the right // one for using the biggest block size allowed on F32: 5x4x4 - if(m < 16000) + if (m < 16000) { desc.m0 = 4; desc.n0 = 4; @@ -100,31 +104,33 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3 return desc; } -DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == 
DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); - const int32_t ofm = dst_shape[0]; - const int32_t m = dst_shape[1] * dst_shape[2]; - const int32_t k = wei_shape[0]; + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(ofm <= 4) + if (ofm <= 4) { // k0 should be as larger as possible. However, we should avoid // having left-over for loops that make the implementation slower. - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -140,11 +146,11 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1 { // The 16000 threshold value has been identified as the right // one for using the biggest block size allowed on F16: 8x4 - if(m >= 16000 && k < 4) + if (m >= 16000 && k < 4) { desc.m0 = 8; desc.n0 = 4; - desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 + desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 } else { diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h index 68dca91885..bab808c66c 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -41,11 +41,14 @@ public: ClIndirectConvDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; } // namespace cl_indirect_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h index 73fbb87560..dd614e1f68 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -45,7 +45,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h index d2f4cde662..d05da18b58 100644 
--- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -49,8 +50,7 @@ public: * @param[in] func_f16 Function to call for indirect convolution F16 * */ - ClIndirectConvConfigArray(T func_f32, T func_f16) - : _configs{ func_f32, func_f16} + ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16} { } @@ -62,7 +62,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -85,8 +85,7 @@ public: * * @param[in] arch GPU target */ - IClIndirectConvKernelConfig(GPUTarget arch) - : _target(arch) + IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig); @@ -98,7 +97,8 @@ public: * @param[in] wei Weights tensor * @param[in] conv_info Convolution info */ - virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; protected: GPUTarget _target; diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp index 01102b3d60..b3c8d891dc 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -28,30 +28,33 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" -#include +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" +#include + namespace arm_compute { namespace cl_matmul { -ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) - : IClMatMulNativeKernelConfig(gpu) +ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu) { } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) +MatMulKernelInfo +ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) { - using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo & info); + using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); - ClMatMulNativeConfigArray configs_G710(&ClMatMulNativeDefaultConfigValhall::configure_G710_f32, - &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, - &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + ClMatMulNativeConfigArray configs_G710( + &ClMatMulNativeDefaultConfigValhall::configure_G710_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, + 
&ClMatMulNativeDefaultConfigValhall::configure_G710_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G710: default: @@ -67,7 +70,7 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo const bool is_batched = lhs_shape.num_dimensions() > 2; - if(is_batched == true) + if (is_batched == true) { lhs_shape.collapse_from(2); } @@ -81,103 +84,48 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo return (this->*func)(m, n, k, b, rhs->lock_paddings(), info); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 2, 8, 4, 1 }, - { 24, 464, 412, 24, 2, 8, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 2, 4, 16, 1 }, - { 1568, 64, 40, 36, 2, 8, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 5, 4, 4, 0 }, - { 24, 464, 412, 24, 6, 2, 8, 0 }, - { 112, 184, 144, 28, 6, 4, 4, 0 }, - { 5776, 64, 32, 36, 5, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 4, 4, 4, 1 }, - { 4096, 48, 32, 36, 2, 2, 16, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 6, 2, 8, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = - { - { 3136, 64, 64, 36, 5, 4, 4, 0 }, - { 4096, 48, 32, 36, 5, 4, 4, 0 }, - { 688, 92, 68, 32, 5, 4, 4, 0 }, - { 24, 464, 412, 24, 6, 2, 4, 0 }, - { 112, 184, 144, 28, 5, 4, 4, 0 }, - { 5776, 64, 32, 36, 5, 4, 4, 0 }, - { 1568, 64, 40, 36, 5, 4, 4, 0 }, - { 2920, 64, 64, 24, 6, 2, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 2, 8, 4, 1 }, - { 24, 464, 412, 24, 2, 8, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 2, 8, 8, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 4, 4, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 2, 2, 16, 1 }, - { 112, 184, 144, 28, 4, 4, 4, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 4, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = - { - { 3136, 64, 
64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 2, 8, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1}, + {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); @@ -185,17 +133,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_nt_nt; configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { configs_best_to_use = 
&configs_mnkb_best_nt_t; configs_fallback_to_use = &configs_mnkb_fallback_nt_t; } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_t_nt; configs_fallback_to_use = &configs_mnkb_fallback_t_nt; @@ -209,108 +157,51 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); - return select_info(desc0, - desc1, - m, n, k, b, DataType::F32, rhs_lock_padding); + return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 16, 1 }, - { 24, 464, 412, 24, 4, 4, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = - { - { 3136, 64, 64, 36, 6, 4, 8, 0 }, - { 4096, 48, 32, 36, 6, 4, 8, 0 }, - { 688, 92, 68, 32, 6, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 0 }, - { 112, 184, 144, 28, 6, 4, 8, 0 }, - { 5776, 64, 32, 36, 6, 4, 8, 0 }, - { 1568, 64, 40, 36, 6, 4, 8, 0 }, - { 2920, 64, 64, 24, 6, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 6, 4, 8, 1 }, - { 4096, 48, 32, 36, 6, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 6, 2, 4, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 6, 4, 8, 1 }, - { 1568, 64, 40, 36, 6, 4, 8, 1 }, - { 2920, 64, 64, 24, 6, 4, 8, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = - { - { 3136, 64, 64, 36, 6, 2, 16, 0 }, - { 4096, 48, 32, 36, 5, 4, 8, 0 }, - { 688, 92, 68, 32, 6, 2, 16, 0 }, - { 24, 464, 412, 24, 6, 2, 16, 0 }, - { 112, 184, 144, 28, 6, 2, 16, 0 }, - { 5776, 64, 32, 36, 5, 4, 8, 0 }, - { 1568, 64, 40, 36, 5, 4, 8, 0 }, - { 2920, 64, 64, 24, 6, 2, 16, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 4, 4, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 4, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 4, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 4, 2, 8, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 
16, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 4, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0}, + {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1}, + {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0}, + {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); @@ -318,17 +209,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_nt_nt; 
configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { configs_best_to_use = &configs_mnkb_best_nt_t; configs_fallback_to_use = &configs_mnkb_fallback_nt_t; } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_t_nt; configs_fallback_to_use = &configs_mnkb_fallback_t_nt; @@ -342,75 +233,46 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); - return select_info(desc0, - desc1, - m, n, k, b, DataType::F16, rhs_lock_padding); + return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { ARM_COMPUTE_UNUSED(rhs_lock_padding); - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 6, 4, 4, 0 }, - { 4096, 48, 32, 36, 6, 4, 4, 0 }, - { 688, 92, 68, 32, 2, 8, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 6, 4, 4, 0 }, - { 5776, 64, 32, 36, 6, 4, 4, 0 }, - { 1568, 64, 40, 36, 6, 4, 4, 0 }, - { 2920, 64, 64, 24, 5, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 4, 4, 16, 0 }, - { 4096, 48, 32, 36, 4, 4, 16, 0 }, - { 688, 92, 68, 32, 4, 4, 16, 0 }, - { 24, 464, 412, 24, 6, 2, 16, 0 }, - { 112, 184, 144, 28, 4, 4, 16, 0 }, - { 5776, 64, 32, 36, 4, 4, 16, 0 }, - { 1568, 64, 40, 36, 6, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 16, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 2, 16, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 2, 16, 0 }, - { 112, 184, 144, 28, 4, 2, 16, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 2, 16, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
index fe167d18dd..6b39db6a3f 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -44,10 +44,13 @@ public:
     MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override;
 
 private:
-    MatMulKernelInfo configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
-    MatMulKernelInfo configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
-    MatMulKernelInfo configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G710_f32(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G710_f16(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G710_u8(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
 };
-} // namespace opencl
+} // namespace cl_matmul
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
index 1e06e84d4d..89cad30214 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
+
 #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
 
 #include <limits>
@@ -37,22 +38,32 @@ namespace cl_matmul
 {
 MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
                              const MatMulKernelInfo &info1,
-                             unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding)
+                             unsigned int m,
+                             unsigned int n,
+                             unsigned int k,
+                             unsigned int b,
+                             DataType data_type,
+                             bool rhs_lock_padding)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, "The fallback MatMul configuration cannot have export_to_cl_image = true");
-    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, "The MatMul configurations must have the same adj_lhs value");
-    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, "The MatMul configurations must have the same adj_rhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+                             "The fallback MatMul configuration cannot have export_to_cl_image = true");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+                             "The MatMul configurations must have the same adj_lhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+                             "The MatMul configurations must have the same adj_rhs value");
 
     const bool adj_lhs = info0.adj_lhs;
     const bool adj_rhs = info0.adj_rhs;
 
-    TensorInfo lhs_info = !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
-    TensorInfo rhs_info = !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+    TensorInfo lhs_info =
+        !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+    TensorInfo rhs_info =
+        !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
     TensorInfo dst_info;
 
-    if(rhs_lock_padding == false)
+    if (rhs_lock_padding == false)
     {
-        if(bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+        if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
         {
             return info0;
         }
@@ -67,7 +78,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
     }
 }
 
-MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool adj_lhs,
+                           bool adj_rhs,
+                           unsigned int m,
+                           unsigned int n,
+                           unsigned int k,
+                           unsigned int b)
 {
     size_t min_acc = std::numeric_limits<size_t>::max();
     size_t min_idx = 0;
@@ -76,12 +93,13 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh
     const size_t num_rows = configs.size();
     const size_t num_cols = configs[0].size();
 
-    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
+    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+                             "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
     ARM_COMPUTE_UNUSED(num_cols);
 
     // Find nearest GeMM workload
     // Note: the workload does not depend on the K dimension
-    for(size_t y = 0; y < num_rows; ++y)
+    for (size_t y = 0; y < num_rows; ++y)
    {
         size_t mc0 = static_cast<size_t>(configs[y][0]);
         size_t nc0 = static_cast<size_t>(configs[y][1]);
@@ -94,7 +112,7 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh
         acc += (k - kc0) * (k - kc0);
         acc += (b - bc0) * (b - bc0);
         acc = std::sqrt(acc);
-        if(acc < min_acc)
+        if (acc < min_acc)
         {
             min_acc = acc;
             min_idx = y;
K0, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast(configs[y][0]); size_t nc0 = static_cast(configs[y][1]); @@ -94,7 +112,7 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h index 3881617558..a114fffa68 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -52,7 +52,12 @@ using MatMulNativeConfigsMatrix = std::vector>; */ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, const MatMulKernelInfo &info1, - unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding); + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding); /** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user * @@ -66,7 +71,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, * * @return @ref MatMulKernelInfo */ -MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b); } // namespace cl_matmul } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h index a2dbfc7dd5..b10018a6d2 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -45,7 +45,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: @@ -56,6 +56,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_matmul } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */ diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h index 4f548bd01d..b9b091100c 100644 --- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h @@ -28,6 +28,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/MatMulInfo.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -53,8 +54,7 @@ public: * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, 
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
index 4f548bd01d..b9b091100c 100644
--- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/MatMulInfo.h"
+
 #include "src/core/common/Macros.h"
 
 namespace arm_compute
@@ -53,8 +54,7 @@ public:
      * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
      *
      */
-    ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8)
-        : _configs{ func_f32, func_f16, func_int8 }
+    ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
     {
     }
 
@@ -66,7 +66,7 @@ public:
      */
     T get_function(DataType data_type)
     {
-        switch(data_type)
+        switch (data_type)
         {
             case DataType::F32:
                 return _configs.at(DT_F32);
@@ -93,8 +93,7 @@ public:
      *
      * @param[in] arch GPU target
      */
-    IClMatMulNativeKernelConfig(GPUTarget arch)
-        : _target(arch)
+    IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
     {
     }
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);
--
cgit v1.2.1