author     Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>  2023-09-27 17:46:17 +0100
committer  felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>  2023-09-28 12:08:05 +0000
commit     afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch)
tree       03bc7d5a762099989b16a656fa8d397b490ed70e /src/core
parent     bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff)
download   ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz
Apply clang-format on repository

Code is formatted as per a revised clang format configuration file (not part of this delivery). Version 14.0.6 is used.

Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)

And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/.

Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
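
Note: the revised .clang-format configuration file is explicitly not part of this delivery. For illustration only, the following is a minimal sketch of a clang-format 14 configuration consistent with the reformatting visible in the diff below (120-column limit, a space after control-statement keywords, tight braced-init-lists, one parameter per line for overlong signatures, regrouped includes). Every option value here is an assumption inferred from the diff, not Arm's actual configuration:

    # Hypothetical .clang-format sketch (clang-format 14.0.6); values inferred
    # from the diff below, NOT the configuration file used for this commit.
    Language:        Cpp
    BasedOnStyle:    LLVM
    ColumnLimit:     120                   # wrapped lines below break near column 120
    IndentWidth:     4
    SpaceBeforeParens: ControlStatements   # if( -> if (, for( -> for (, switch( -> switch (
    Cpp11BracedListStyle: true             # { _device } -> {_device}
    AlignAfterOpenBracket: Align           # wrapped parameters align under the opening parenthesis
    BinPackParameters: false               # overlong signatures get one parameter per line
    AlignConsecutiveDeclarations: true     # override declarations in headers are column-aligned
    PackConstructorInitializers: NextLine  # short initializer lists stay inline, long ones break one per line
    SortIncludes:    CaseSensitive         # include lists are reordered...
    IncludeBlocks:   Regroup               # ...into blank-line-separated groups

With such a file at the repository root, the reformatting would then be applied in place over the non-excluded files, e.g. by running clang-format -i (version 14.0.6) on each of them.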
Diffstat (limited to 'src/core')
-rw-r--r--  src/core/AccessWindowAutoPadding.cpp | 16
-rw-r--r--  src/core/AccessWindowAutoPadding.h | 9
-rw-r--r--  src/core/AccessWindowStatic.cpp | 45
-rw-r--r--  src/core/AccessWindowStatic.h | 9
-rw-r--r--  src/core/AccessWindowTranspose.cpp | 54
-rw-r--r--  src/core/AccessWindowTranspose.h | 5
-rw-r--r--  src/core/CL/CLCommandBuffer.cpp | 2
-rw-r--r--  src/core/CL/CLCommandBuffer.h | 5
-rw-r--r--  src/core/CL/CLCompatCommandBuffer.cpp | 32
-rw-r--r--  src/core/CL/CLCompatCommandBuffer.h | 5
-rw-r--r--  src/core/CL/CLCompileContext.cpp | 91
-rw-r--r--  src/core/CL/CLHelpers.cpp | 93
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp | 14
-rw-r--r--  src/core/CL/CLMutableCommandBuffer.cpp | 36
-rw-r--r--  src/core/CL/CLMutableCommandBuffer.h | 5
-rw-r--r--  src/core/CL/CLUtils.cpp | 35
-rw-r--r--  src/core/CL/CLUtils.h | 7
-rw-r--r--  src/core/CL/CLValidate.h | 18
-rw-r--r--  src/core/CL/DefaultLWSHeuristics.cpp | 14
-rw-r--r--  src/core/CL/ICLKernel.cpp | 32
-rw-r--r--  src/core/CL/ICLKernel.h | 60
-rw-r--r--  src/core/CL/ICLSimple2DKernel.cpp | 3
-rw-r--r--  src/core/CL/ICLSimple2DKernel.h | 2
-rw-r--r--  src/core/CL/ICLSimple3DKernel.cpp | 3
-rw-r--r--  src/core/CL/ICLSimple3DKernel.h | 2
-rw-r--r--  src/core/CL/ICLSimpleKernel.cpp | 17
-rw-r--r--  src/core/CL/ICLSimpleKernel.h | 9
-rw-r--r--  src/core/CL/ICLTensor.cpp | 3
-rw-r--r--  src/core/CL/OpenCL.cpp | 549
-rw-r--r--  src/core/CL/cl_kernels/activation_float_helpers.h | 13
-rw-r--r--  src/core/CL/cl_kernels/activation_quant_helpers.h | 15
-rw-r--r--  src/core/CL/cl_kernels/gemm_helpers.h | 252
-rw-r--r--  src/core/CL/cl_kernels/helpers.h | 817
-rw-r--r--  src/core/CL/cl_kernels/helpers_asymm.h | 337
-rw-r--r--  src/core/CL/cl_kernels/load_store_utility.h | 73
-rw-r--r--  src/core/CL/cl_kernels/repeat.h | 42
-rw-r--r--  src/core/CL/cl_kernels/warp_helpers.h | 59
-rw-r--r--  src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp | 80
-rw-r--r--  src/core/CL/kernels/CLArgMinMaxLayerKernel.h | 10
-rw-r--r--  src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp | 125
-rw-r--r--  src/core/CL/kernels/CLBatchNormalizationLayerKernel.h | 32
-rw-r--r--  src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp | 76
-rw-r--r--  src/core/CL/kernels/CLBatchToSpaceLayerKernel.h | 25
-rw-r--r--  src/core/CL/kernels/CLBitwiseKernel.cpp | 25
-rw-r--r--  src/core/CL/kernels/CLBitwiseKernel.h | 6
-rw-r--r--  src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp | 42
-rw-r--r--  src/core/CL/kernels/CLBoundingBoxTransformKernel.h | 16
-rw-r--r--  src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp | 64
-rw-r--r--  src/core/CL/kernels/CLChannelShuffleLayerKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLComparisonKernel.cpp | 75
-rw-r--r--  src/core/CL/kernels/CLComparisonKernel.h | 14
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 25
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp | 84
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h | 23
-rw-r--r--  src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp | 29
-rw-r--r--  src/core/CL/kernels/CLDepthToSpaceLayerKernel.h | 4
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp | 199
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h | 45
-rw-r--r--  src/core/CL/kernels/CLFFTDigitReverseKernel.cpp | 42
-rw-r--r--  src/core/CL/kernels/CLFFTDigitReverseKernel.h | 18
-rw-r--r--  src/core/CL/kernels/CLFFTRadixStageKernel.cpp | 46
-rw-r--r--  src/core/CL/kernels/CLFFTRadixStageKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLFFTScaleKernel.cpp | 26
-rw-r--r--  src/core/CL/kernels/CLFFTScaleKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLFillBorderKernel.cpp | 59
-rw-r--r--  src/core/CL/kernels/CLFillBorderKernel.h | 18
-rw-r--r--  src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp | 129
-rw-r--r--  src/core/CL/kernels/CLFuseBatchNormalizationKernel.h | 41
-rw-r--r--  src/core/CL/kernels/CLGatherKernel.cpp | 36
-rw-r--r--  src/core/CL/kernels/CLGatherKernel.h | 10
-rw-r--r--  src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp | 33
-rw-r--r--  src/core/CL/kernels/CLGenerateProposalsLayerKernel.h | 7
-rw-r--r--  src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp | 54
-rw-r--r--  src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h | 16
-rw-r--r--  src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp | 45
-rw-r--r--  src/core/CL/kernels/CLL2NormalizeLayerKernel.h | 11
-rw-r--r--  src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp | 45
-rw-r--r--  src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h | 11
-rw-r--r--  src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp | 19
-rw-r--r--  src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLNormalizationLayerKernel.cpp | 72
-rw-r--r--  src/core/CL/kernels/CLNormalizationLayerKernel.h | 7
-rw-r--r--  src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp | 56
-rw-r--r--  src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLPadLayerKernel.cpp | 95
-rw-r--r--  src/core/CL/kernels/CLPadLayerKernel.h | 20
-rw-r--r--  src/core/CL/kernels/CLPriorBoxLayerKernel.cpp | 83
-rw-r--r--  src/core/CL/kernels/CLPriorBoxLayerKernel.h | 27
-rw-r--r--  src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp | 47
-rw-r--r--  src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLROIAlignLayerKernel.cpp | 51
-rw-r--r--  src/core/CL/kernels/CLROIAlignLayerKernel.h | 14
-rw-r--r--  src/core/CL/kernels/CLROIPoolingLayerKernel.cpp | 38
-rw-r--r--  src/core/CL/kernels/CLROIPoolingLayerKernel.h | 14
-rw-r--r--  src/core/CL/kernels/CLRangeKernel.cpp | 38
-rw-r--r--  src/core/CL/kernels/CLRangeKernel.h | 1
-rw-r--r--  src/core/CL/kernels/CLReductionOperationKernel.cpp | 103
-rw-r--r--  src/core/CL/kernels/CLReductionOperationKernel.h | 10
-rw-r--r--  src/core/CL/kernels/CLReorgLayerKernel.cpp | 41
-rw-r--r--  src/core/CL/kernels/CLReorgLayerKernel.h | 1
-rw-r--r--  src/core/CL/kernels/CLReverseKernel.cpp | 16
-rw-r--r--  src/core/CL/kernels/CLReverseKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLSelectKernel.cpp | 33
-rw-r--r--  src/core/CL/kernels/CLSelectKernel.h | 7
-rw-r--r--  src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp | 100
-rw-r--r--  src/core/CL/kernels/CLSpaceToBatchLayerKernel.h | 35
-rw-r--r--  src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp | 23
-rw-r--r--  src/core/CL/kernels/CLSpaceToDepthLayerKernel.h | 4
-rw-r--r--  src/core/CL/kernels/CLStackLayerKernel.cpp | 38
-rw-r--r--  src/core/CL/kernels/CLStackLayerKernel.h | 17
-rw-r--r--  src/core/CL/kernels/CLStridedSliceKernel.cpp | 101
-rw-r--r--  src/core/CL/kernels/CLStridedSliceKernel.h | 24
-rw-r--r--  src/core/CL/kernels/CLTileKernel.cpp | 30
-rw-r--r--  src/core/CL/kernels/CLTileKernel.h | 5
-rw-r--r--  src/core/CPP/CPPTypes.cpp | 4
-rw-r--r--  src/core/CPP/Validate.h | 26
-rw-r--r--  src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp | 171
-rw-r--r--  src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp | 110
-rw-r--r--  src/core/CPP/kernels/CPPPermuteKernel.cpp | 45
-rw-r--r--  src/core/CPP/kernels/CPPTopKVKernel.cpp | 43
-rw-r--r--  src/core/CPP/kernels/CPPUpsampleKernel.cpp | 17
-rw-r--r--  src/core/Error.cpp | 5
-rw-r--r--  src/core/GPUTarget.cpp | 97
-rw-r--r--  src/core/Helpers.cpp | 27
-rw-r--r--  src/core/IAccessWindow.cpp | 79
-rw-r--r--  src/core/IKernel.cpp | 3
-rw-r--r--  src/core/ITensor.cpp | 34
-rw-r--r--  src/core/ITensorPack.cpp | 9
-rw-r--r--  src/core/NEON/NEAsymm.h | 308
-rw-r--r--  src/core/NEON/NEAsymm.inl | 10
-rw-r--r--  src/core/NEON/NEFixedPoint.inl | 8
-rw-r--r--  src/core/NEON/NEMath.inl | 105
-rw-r--r--  src/core/NEON/NESymm.h | 95
-rw-r--r--  src/core/NEON/SVEAsymm.h | 47
-rw-r--r--  src/core/NEON/SVEMath.h | 8
-rw-r--r--  src/core/NEON/SVEMath.inl | 70
-rw-r--r--  src/core/NEON/SVESymm.h | 23
-rw-r--r--  src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp | 302
-rw-r--r--  src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h | 21
-rw-r--r--  src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp | 127
-rw-r--r--  src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h | 13
-rw-r--r--  src/core/NEON/kernels/NEBitwiseAndKernel.cpp | 17
-rw-r--r--  src/core/NEON/kernels/NEBitwiseNotKernel.cpp | 14
-rw-r--r--  src/core/NEON/kernels/NEBitwiseOrKernel.cpp | 18
-rw-r--r--  src/core/NEON/kernels/NEBitwiseXorKernel.cpp | 18
-rw-r--r--  src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp | 68
-rw-r--r--  src/core/NEON/kernels/NEBoundingBoxTransformKernel.h | 8
-rw-r--r--  src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp | 97
-rw-r--r--  src/core/NEON/kernels/NECol2ImKernel.h | 4
-rw-r--r--  src/core/NEON/kernels/NECropKernel.cpp | 238
-rw-r--r--  src/core/NEON/kernels/NECropKernel.h | 19
-rw-r--r--  src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 76
-rw-r--r--  src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp | 149
-rw-r--r--  src/core/NEON/kernels/NEFFTDigitReverseKernel.h | 6
-rw-r--r--  src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | 594
-rw-r--r--  src/core/NEON/kernels/NEFFTRadixStageKernel.h | 14
-rw-r--r--  src/core/NEON/kernels/NEFFTScaleKernel.cpp | 21
-rw-r--r--  src/core/NEON/kernels/NEFFTScaleKernel.h | 4
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.cpp | 225
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.h | 11
-rw-r--r--  src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp | 244
-rw-r--r--  src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h | 39
-rw-r--r--  src/core/NEON/kernels/NEGatherKernel.cpp | 80
-rw-r--r--  src/core/NEON/kernels/NEGatherKernel.h | 5
-rw-r--r--  src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp | 48
-rw-r--r--  src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h | 2
-rw-r--r--  src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp | 57
-rw-r--r--  src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h | 8
-rw-r--r--  src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 59
-rw-r--r--  src/core/NEON/kernels/NEL2NormalizeLayerKernel.h | 3
-rw-r--r--  src/core/NEON/kernels/NELogicalKernel.cpp | 91
-rw-r--r--  src/core/NEON/kernels/NELogicalKernel.h | 5
-rw-r--r--  src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp | 54
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.cpp | 144
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.h | 8
-rw-r--r--  src/core/NEON/kernels/NEPadLayerKernel.cpp | 106
-rw-r--r--  src/core/NEON/kernels/NEPadLayerKernel.h | 13
-rw-r--r--  src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp | 166
-rw-r--r--  src/core/NEON/kernels/NEPriorBoxLayerKernel.h | 14
-rw-r--r--  src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp | 118
-rw-r--r--  src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h | 33
-rw-r--r--  src/core/NEON/kernels/NEROIAlignLayerKernel.cpp | 79
-rw-r--r--  src/core/NEON/kernels/NEROIAlignLayerKernel.h | 5
-rw-r--r--  src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp | 85
-rw-r--r--  src/core/NEON/kernels/NEROIPoolingLayerKernel.h | 8
-rw-r--r--  src/core/NEON/kernels/NERangeKernel.cpp | 90
-rw-r--r--  src/core/NEON/kernels/NERangeKernel.h | 1
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.cpp | 1955
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.h | 3
-rw-r--r--  src/core/NEON/kernels/NEReorderKernel.cpp | 70
-rw-r--r--  src/core/NEON/kernels/NEReorderKernel.h | 33
-rw-r--r--  src/core/NEON/kernels/NEReorgLayerKernel.cpp | 56
-rw-r--r--  src/core/NEON/kernels/NEReverseKernel.cpp | 98
-rw-r--r--  src/core/NEON/kernels/NEReverseKernel.h | 3
-rw-r--r--  src/core/NEON/kernels/NESelectKernel.cpp | 156
-rw-r--r--  src/core/NEON/kernels/NESelectKernel.h | 2
-rw-r--r--  src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp | 161
-rw-r--r--  src/core/NEON/kernels/NESpaceToBatchLayerKernel.h | 20
-rw-r--r--  src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp | 59
-rw-r--r--  src/core/NEON/kernels/NESpaceToDepthLayerKernel.h | 1
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.cpp | 55
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.h | 10
-rw-r--r--  src/core/NEON/kernels/NEStridedSliceKernel.cpp | 115
-rw-r--r--  src/core/NEON/kernels/NEStridedSliceKernel.h | 23
-rw-r--r--  src/core/NEON/kernels/NETileKernel.cpp | 47
-rw-r--r--  src/core/NEON/kernels/assembly/depthwise.hpp | 270
-rw-r--r--  src/core/NEON/kernels/assembly/depthwise_common.hpp | 106
-rw-r--r--  src/core/NEON/kernels/assembly/pool_common.hpp | 71
-rw-r--r--  src/core/NEON/kernels/assembly/pooling.hpp | 210
-rw-r--r--  src/core/NEON/kernels/assembly/premultiply.hpp | 17
-rw-r--r--  src/core/NEON/kernels/assembly/winograd.hpp | 181
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp | 166
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp | 166
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp | 115
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp | 115
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/list.h | 6
-rw-r--r--  src/core/NEON/kernels/detail/NEActivationFunctionDetail.h | 7
-rw-r--r--  src/core/NEON/kernels/detail/NEColorConvertHelper.inl | 735
-rw-r--r--  src/core/NEON/kernels/detail/NEDirectConvolution3x3.h | 80
-rw-r--r--  src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h | 507
-rw-r--r--  src/core/NEON/wrapper/intrinsics/cvt.h | 47
-rw-r--r--  src/core/NEON/wrapper/intrinsics/div.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/erf.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/exp.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/getlane.h | 14
-rw-r--r--  src/core/NEON/wrapper/intrinsics/inv.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/invsqrt.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/log.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/pow.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/qmov.h | 6
-rw-r--r--  src/core/NEON/wrapper/intrinsics/reinterpret.h | 2
-rw-r--r--  src/core/NEON/wrapper/intrinsics/round.h | 1
-rw-r--r--  src/core/NEON/wrapper/intrinsics/setlane.h | 12
-rw-r--r--  src/core/NEON/wrapper/intrinsics/shr.h | 4
-rw-r--r--  src/core/NEON/wrapper/intrinsics/sin.h | 3
-rw-r--r--  src/core/NEON/wrapper/intrinsics/svcnt.h | 4
-rw-r--r--  src/core/NEON/wrapper/intrinsics/svcvt.h | 35
-rw-r--r--  src/core/NEON/wrapper/intrinsics/svexp.h | 3
-rw-r--r--  src/core/NEON/wrapper/intrinsics/svlog.h | 3
-rw-r--r--  src/core/NEON/wrapper/intrinsics/svptrue.h | 4
-rw-r--r--  src/core/NEON/wrapper/intrinsics/svwhilelt.h | 4
-rw-r--r--  src/core/NEON/wrapper/intrinsics/tanh.h | 1
-rw-r--r--  src/core/NEON/wrapper/scalar/add.h | 12
-rw-r--r--  src/core/NEON/wrapper/scalar/sub.h | 12
-rw-r--r--  src/core/NEON/wrapper/svtraits.h | 1
-rw-r--r--  src/core/Rounding.cpp | 7
-rw-r--r--  src/core/Size2D.cpp | 3
-rw-r--r--  src/core/Size3D.cpp | 6
-rw-r--r--  src/core/SubTensorInfo.cpp | 38
-rw-r--r--  src/core/TensorInfo.cpp | 99
-rw-r--r--  src/core/Utils.cpp | 260
-rw-r--r--  src/core/Validate.cpp | 115
-rw-r--r--  src/core/common/Macros.h | 4
-rw-r--r--  src/core/common/Registrars.h | 12
-rw-r--r--  src/core/helpers/AutoConfiguration.h | 21
-rw-r--r--  src/core/helpers/MemoryHelpers.h | 61
-rw-r--r--  src/core/helpers/PoolingHelpers.h | 101
-rw-r--r--  src/core/helpers/ScaleHelpers.h | 23
-rw-r--r--  src/core/helpers/SoftmaxHelpers.cpp | 2
-rw-r--r--  src/core/helpers/Utils.cpp | 4
-rw-r--r--  src/core/helpers/Utils.h | 2
-rw-r--r--  src/core/helpers/WindowHelpers.cpp | 163
-rw-r--r--  src/core/helpers/WindowHelpers.h | 57
-rw-r--r--  src/core/utils/ActivationFunctionUtils.cpp | 36
-rw-r--r--  src/core/utils/AssemblyUtils.cpp | 14
-rw-r--r--  src/core/utils/AssemblyUtils.h | 3
-rw-r--r--  src/core/utils/DataLayoutUtils.cpp | 9
-rw-r--r--  src/core/utils/DataTypeUtils.cpp | 54
-rw-r--r--  src/core/utils/FormatUtils.cpp | 30
-rw-r--r--  src/core/utils/InterpolationPolicyUtils.cpp | 9
-rw-r--r--  src/core/utils/ScaleUtils.cpp | 15
-rw-r--r--  src/core/utils/ScaleUtils.h | 7
-rw-r--r--  src/core/utils/StringUtils.cpp | 16
-rw-r--r--  src/core/utils/helpers/fft.cpp | 19
-rw-r--r--  src/core/utils/helpers/float_ops.h | 3
-rw-r--r--  src/core/utils/helpers/tensor_info.h | 14
-rw-r--r--  src/core/utils/helpers/tensor_transform.cpp | 63
-rw-r--r--  src/core/utils/io/FileHandler.cpp | 7
-rw-r--r--  src/core/utils/logging/FilePrinter.cpp | 5
-rw-r--r--  src/core/utils/logging/Helpers.cpp | 13
-rw-r--r--  src/core/utils/logging/Logger.cpp | 17
-rw-r--r--  src/core/utils/logging/LoggerRegistry.cpp | 18
-rw-r--r--  src/core/utils/misc/MMappedFile.cpp | 26
-rw-r--r--  src/core/utils/quantization/AsymmHelpers.cpp | 72
-rw-r--r--  src/core/utils/quantization/AsymmHelpers.h | 7
286 files changed, 9956 insertions, 7901 deletions
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index ca2f7d238f..52be6990ab 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -28,12 +28,14 @@
using namespace arm_compute;
-AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
- : _info(info)
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) : _info(info)
{
}
-ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_UNUSED(input_valid_region);
@@ -45,17 +47,17 @@ ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window,
ValidRegion AccessWindowAutoPadding::compute_valid_region() const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return ValidRegion{};
}
- return ValidRegion{ Coordinates(), _info->tensor_shape() };
+ return ValidRegion{Coordinates(), _info->tensor_shape()};
}
void AccessWindowAutoPadding::set_valid_region()
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return;
}
@@ -75,7 +77,7 @@ bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window)
ARM_COMPUTE_UNUSED(window);
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h
index b8d1508679..406bdba0d8 100644
--- a/src/core/AccessWindowAutoPadding.h
+++ b/src/core/AccessWindowAutoPadding.h
@@ -74,9 +74,12 @@ public:
ValidRegion compute_valid_region() const;
// Inherited methods overridden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
private:
ITensorInfo *_info;
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 0607011bc5..98182b1202 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -34,7 +34,10 @@ AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start
{
}
-ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
ARM_COMPUTE_UNUSED(border_undefined);
ARM_COMPUTE_UNUSED(border_size);
@@ -44,7 +47,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -57,7 +60,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
// Start of the valid region is equal to the start of the static access but
// never outside of the tensor.
anchor.set(0, std::max<int>(0, _start_x));
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
anchor.set(1, std::max<int>(0, _start_y));
}
@@ -65,7 +68,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
// End of the valid region is equal to the end of the static access but
// never outside of the tensor.
shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
}
@@ -75,7 +78,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region)
{
- if(_info != nullptr)
+ if (_info != nullptr)
{
_info->set_valid_region(compute_valid_region(window, input_valid_region));
}
@@ -84,7 +87,7 @@ void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegio
bool AccessWindowStatic::update_window_if_needed(Window &window) const
{
// If the padding is not enough and the tensor is not resizable, shrink the window to size 0
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
@@ -96,48 +99,50 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const
bool window_modified = false;
// Calculate if padding is enough
- if(_start_y < 0)
+ if (_start_y < 0)
{
const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
- if(_start_y < front_pad_y_available)
+ if (_start_y < front_pad_y_available)
{
window_modified = true;
}
}
- if(!window_modified)
+ if (!window_modified)
{
- if(_end_y > static_cast<int>(shape[1]))
+ if (_end_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1];
- if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
{
window_modified = true;
}
}
- if(!window_modified)
+ if (!window_modified)
{
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
- if(_start_x < 0)
+ if (_start_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(_start_x < front_pad_x_available)
+ if (_start_x < front_pad_x_available)
{
window_modified = true;
}
}
- if(!window_modified && _end_x > static_cast<int>(shape[0]))
+ if (!window_modified && _end_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0];
- if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
{
window_modified = true;
}
@@ -146,9 +151,9 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const
}
// If padding is not enough
- if(window_modified)
+ if (window_modified)
{
- for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
window.set(i, Window::Dimension(0, 0, 1));
}
@@ -162,7 +167,7 @@ bool AccessWindowStatic::update_padding_if_needed(const Window &window)
ARM_COMPUTE_UNUSED(window);
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h
index f7d43cbb55..5c6d2c7db0 100644
--- a/src/core/AccessWindowStatic.h
+++ b/src/core/AccessWindowStatic.h
@@ -86,9 +86,12 @@ public:
ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
// Inherited methods overriden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
private:
ITensorInfo *_info;
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index d8bd4c4de1..42f0081c14 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -29,9 +29,12 @@
using namespace arm_compute;
-ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -41,7 +44,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
Coordinates old_anchor(anchor);
TensorShape old_shape(shape);
- if(!border_undefined)
+ if (!border_undefined)
{
border_size = BorderSize(0);
}
@@ -53,7 +56,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
// the kernel to write back output values.
// As the relation between input and output is transposed window.y() is
// used for x anchor and window.x() for y anchor.
- if(_info->dimension(0) > 1)
+ if (_info->dimension(0) > 1)
{
anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
}
@@ -69,15 +72,19 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
// a size of the region.
// As the relation between input and output is transposed window.y() is
// used for x shape and window.x() for y shape.
- if(_info->dimension(0) > 1)
+ if (_info->dimension(0) > 1)
{
- shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right,
+ (window.y().end() - window.y().step()) * _scale_x + _width) -
+ anchor[0]);
}
- shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom,
+ (window.x().end() - window.x().step()) * _scale_y + _height) -
+ anchor[1]);
// For higher dimensions use the intersection of the window size and the
// valid region of the input
- for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ for (size_t d = 2; d < _info->num_dimensions(); ++d)
{
anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -89,7 +96,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
bool AccessWindowTranspose::update_window_if_needed(Window &window) const
{
// Only update the window size if we can't use padding
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
@@ -107,12 +114,12 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
const int max_y = window.x().end() * _scale_y + _y;
// Adjust window start for output's Y dimension (so X in (input) window)
- if(min_y < 0)
+ if (min_y < 0)
{
// Calculate rows available above the tensor
const int front_pad_y_available = -offset_first_element / strides[1];
- if(min_y < front_pad_y_available)
+ if (min_y < front_pad_y_available)
{
// Not enough padding available, need to shrink the window
const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y;
@@ -126,17 +133,18 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
}
// Adjust window end for Y dimension
- if(max_y > static_cast<int>(shape[1]))
+ if (max_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
// Calculate rows available below the tensor
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
- if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height;
+ const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) +
+ window.x().step() * _scale_y - _y - _height;
window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step()));
window_modified = true;
}
@@ -151,11 +159,14 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
// Adjust window start for X dimension
- if(min_x < 0)
+ if (min_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+ stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(min_x < front_pad_x_available)
+ if (min_x < front_pad_x_available)
{
// Not enough padding available, need to shrink the window
const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x;
@@ -168,14 +179,15 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
}
// Adjust window end for X dimension
- if(max_x > static_cast<int>(shape[0]))
+ if (max_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
- if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width;
+ const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) +
+ window.y().step() * _scale_x - _x - _width;
window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step()));
window_modified = true;
}
@@ -189,7 +201,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
bool AccessWindowTranspose::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h
index 0306076d6e..12bb9a535b 100644
--- a/src/core/AccessWindowTranspose.h
+++ b/src/core/AccessWindowTranspose.h
@@ -42,7 +42,10 @@ public:
bool update_window_if_needed(Window &window) const override;
bool update_padding_if_needed(const Window &window) override;
using AccessWindowRectangle::compute_valid_region;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/
diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp
index 7fcfdf2c89..d094dcdaea 100644
--- a/src/core/CL/CLCommandBuffer.cpp
+++ b/src/core/CL/CLCommandBuffer.cpp
@@ -38,7 +38,7 @@ std::unique_ptr<CLCommandBuffer> CLCommandBuffer::create(cl_command_queue queue)
const auto &cl_device = CLKernelLibrary::get().get_device();
const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device);
- if(has_mutable_dispatch)
+ if (has_mutable_dispatch)
{
return std::make_unique<CLMutableCommandBuffer>(queue);
}
diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h
index 8a94e389fa..90e434161e 100644
--- a/src/core/CL/CLCommandBuffer.h
+++ b/src/core/CL/CLCommandBuffer.h
@@ -87,7 +87,8 @@ public:
* @param[in] global The global work size.
* @param[in] local The local work size.
*/
- virtual void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0;
+ virtual void
+ add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0;
/** Add the mutable argument to the current kernel enqueue command.
*
@@ -154,7 +155,7 @@ protected:
CLCommandBuffer &state(State state);
private:
- State _state{ State::Created };
+ State _state{State::Created};
};
} // namespace arm_compute
diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp
index f1a902c7b9..242fd7719c 100644
--- a/src/core/CL/CLCompatCommandBuffer.cpp
+++ b/src/core/CL/CLCompatCommandBuffer.cpp
@@ -31,8 +31,7 @@
namespace arm_compute
{
-CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue)
- : _queue(queue)
+CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue)
{
}
@@ -40,11 +39,14 @@ CLCompatCommandBuffer::~CLCompatCommandBuffer()
{
}
-void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local)
+void CLCompatCommandBuffer::add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local)
{
ARM_COMPUTE_ERROR_ON(state() != State::Created);
- _kernel_cmds.push_back(KernelCommand{ kernel, offset, global, local, {} });
+ _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}});
}
void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size)
@@ -52,7 +54,7 @@ void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const
ARM_COMPUTE_ERROR_ON(state() != State::Created);
ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty());
- _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{ arg_idx, size, value });
+ _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value});
}
void CLCompatCommandBuffer::finalize()
@@ -61,7 +63,7 @@ void CLCompatCommandBuffer::finalize()
_kernel_cmds.shrink_to_fit();
- for(auto &cmd : _kernel_cmds)
+ for (auto &cmd : _kernel_cmds)
{
cmd.mutable_args.shrink_to_fit();
}
@@ -80,25 +82,19 @@ void CLCompatCommandBuffer::enqueue()
{
ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
- for(const auto &cmd : _kernel_cmds)
+ for (const auto &cmd : _kernel_cmds)
{
- for(const auto &arg : cmd.mutable_args)
+ for (const auto &arg : cmd.mutable_args)
{
const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value);
handle_cl_error("clSetKernelArg", error);
}
- const auto error = clEnqueueNDRangeKernel(
- _queue,
- cmd.kernel,
- static_cast<cl_uint>(cmd.global.dimensions()),
- cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr,
- cmd.global.get(),
- cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr,
- 0,
- nullptr,
- nullptr);
+ const auto error =
+ clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast<cl_uint>(cmd.global.dimensions()),
+ cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(),
+ cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr);
handle_cl_error("clEnqueueNDRangeKernel", error);
}
diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h
index e91d52d2d6..d5df106425 100644
--- a/src/core/CL/CLCompatCommandBuffer.h
+++ b/src/core/CL/CLCompatCommandBuffer.h
@@ -57,7 +57,10 @@ public:
/** Disallow move assignment. */
CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete;
- void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override;
+ void add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local) override;
void finalize() override;
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 2d024f9c2f..9bbc32657e 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
+
#include "support/StringSupport.h"
#include <regex>
namespace arm_compute
{
-CLBuildOptions::CLBuildOptions()
- : _build_opts()
+CLBuildOptions::CLBuildOptions() : _build_opts()
{
}
@@ -45,7 +45,7 @@ void CLBuildOptions::add_option(std::string option)
void CLBuildOptions::add_option_if(bool cond, std::string option)
{
- if(cond)
+ if (cond)
{
add_option(std::move(option));
}
@@ -63,7 +63,7 @@ void CLBuildOptions::add_options(const StringSet &options)
void CLBuildOptions::add_options_if(bool cond, const StringSet &options)
{
- if(cond)
+ if (cond)
{
add_options(options);
}
@@ -79,26 +79,35 @@ bool CLBuildOptions::operator==(const CLBuildOptions &other) const
return _build_opts == other._build_opts;
}
-Program::Program()
- : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
+Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
{
}
Program::Program(cl::Context context, std::string name, std::string source)
- : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary()
+ : _context(std::move(context)),
+ _device(),
+ _is_binary(false),
+ _name(std::move(name)),
+ _source(std::move(source)),
+ _binary()
{
}
Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary)
- : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary))
+ : _context(std::move(context)),
+ _device(std::move(device)),
+ _is_binary(true),
+ _name(std::move(name)),
+ _source(),
+ _binary(std::move(binary))
{
}
Program::operator cl::Program() const
{
- if(_is_binary)
+ if (_is_binary)
{
- return cl::Program(_context, { _device }, { _binary });
+ return cl::Program(_context, {_device}, {_binary});
}
else
{
@@ -112,12 +121,12 @@ bool Program::build(const cl::Program &program, const std::string &build_options
{
return program.build(build_options.c_str()) == CL_SUCCESS;
}
- catch(const cl::Error &e)
+ catch (const cl::Error &e)
{
cl_int err = CL_SUCCESS;
const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err);
- for(auto &pair : build_info)
+ for (auto &pair : build_info)
{
std::cerr << pair.second << std::endl;
}
@@ -133,14 +142,12 @@ cl::Program Program::build(const std::string &build_options) const
return cl_program;
}
-Kernel::Kernel()
- : _name(), _kernel()
+Kernel::Kernel() : _name(), _kernel()
{
}
Kernel::Kernel(std::string name, const cl::Program &program)
- : _name(std::move(name)),
- _kernel(cl::Kernel(program, _name.c_str()))
+ : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str()))
{
}
CLCompileContext::CLCompileContext()
@@ -156,15 +163,19 @@ CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device
_is_wbsm_supported = get_wbsm_support_info(device);
}
-Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
- const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const
+Kernel CLCompileContext::create_kernel(const std::string &kernel_name,
+ const std::string &program_name,
+ const std::string &program_source,
+ const std::string &kernel_path,
+ const StringSet &build_options_set,
+ bool is_binary) const
{
const std::string build_options = generate_build_options(build_options_set, kernel_path);
const std::string built_program_name = program_name + "_" + build_options;
auto built_program_it = _built_programs_map.find(built_program_name);
cl::Program cl_program;
- if(_built_programs_map.end() != built_program_it)
+ if (_built_programs_map.end() != built_program_it)
{
// If program has been built, retrieve to create kernel from it
cl_program = built_program_it->second;
@@ -184,11 +195,12 @@ Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std
return Kernel(kernel_name, cl_program);
}
-const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
+const Program &
+CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
{
const auto program_it = _programs_map.find(program_name);
- if(program_it != _programs_map.end())
+ if (program_it != _programs_map.end())
{
return program_it->second;
}
@@ -199,9 +211,10 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c
ARM_COMPUTE_UNUSED(is_binary);
program = Program(_context, program_name, program_source);
#else /* EMBEDDED_KERNELS */
- if(is_binary)
+ if (is_binary)
{
- program = Program(_context, _device.cl_device(), program_name, std::vector<unsigned char>(program_source.begin(), program_source.end()));
+ program = Program(_context, _device.cl_device(), program_name,
+ std::vector<unsigned char>(program_source.begin(), program_source.end()));
}
else
{
@@ -218,18 +231,19 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c
void CLCompileContext::set_context(cl::Context context)
{
_context = std::move(context);
- if(_context.get() != nullptr)
+ if (_context.get() != nullptr)
{
const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
- if(!cl_devices.empty())
+ if (!cl_devices.empty())
{
_device = CLDevice(cl_devices[0]);
}
}
}
-std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const
+std::string CLCompileContext::generate_build_options(const StringSet &build_options_set,
+ const std::string &kernel_path) const
{
std::string concat_str;
bool ext_supported = false;
@@ -241,27 +255,27 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
GPUTarget gpu_arch = get_arch_from_target(_device.target());
- concat_str += " -DGPU_ARCH=" + support::cpp11::to_string(
- static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
+ concat_str +=
+ " -DGPU_ARCH=" + support::cpp11::to_string(static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
- if(_device.supported("cl_khr_fp16"))
+ if (_device.supported("cl_khr_fp16"))
{
concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
}
- if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
+ if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
{
concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 ";
}
- if(_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
+ if (_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
{
concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 ";
}
std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported();
- if(ext_supported)
+ if (ext_supported)
{
concat_str += ext_buildopts;
}
@@ -270,7 +284,7 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti
ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
}
- if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
+ if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
{
concat_str += " -DUNROLL_WITH_PRAGMA ";
}
@@ -295,7 +309,7 @@ std::string CLCompileContext::stringify_set(const StringSet &s, const std::strin
#endif /* EMBEDDED_KERNELS */
// Concatenate set
- for(const auto &el : s)
+ for (const auto &el : s)
{
concat_set += " " + el;
}
@@ -340,7 +354,7 @@ cl::NDRange CLCompileContext::default_ndrange() const
GPUTarget _target = get_target_from_device(_device.cl_device());
cl::NDRange default_range;
- switch(_target)
+ switch (_target)
{
case GPUTarget::MIDGARD:
case GPUTarget::T600:
@@ -370,7 +384,8 @@ size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) cons
size_t result;
size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result);
- ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
ARM_COMPUTE_UNUSED(err);
return result;
@@ -392,7 +407,7 @@ int32_t CLCompileContext::get_ddk_version() const
const std::regex ddk_regex("r([0-9]*)p[0-9]");
std::smatch ddk_match;
- if(std::regex_search(device_version, ddk_match, ddk_regex))
+ if (std::regex_search(device_version, ddk_match, ddk_regex))
{
return std::stoi(ddk_match[1]);
}
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 78f36100d5..5ea99d360a 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLHelpers.h"
+
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Types.h"
-#include "src/gpu/cl/ClCompileContext.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/ClKernelLibrary.h"
#include <utility>
@@ -39,7 +40,7 @@ namespace arm_compute
{
std::string get_cl_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -75,7 +76,7 @@ std::string get_cl_type_from_data_type(const DataType &dt)
std::string get_cl_promoted_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -105,7 +106,7 @@ std::string get_cl_promoted_type_from_data_type(const DataType &dt)
std::string get_cl_unsigned_type_from_element_size(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return "uchar";
@@ -123,7 +124,7 @@ std::string get_cl_unsigned_type_from_element_size(size_t element_size)
std::string get_cl_signed_type_from_element_size(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return "char";
@@ -141,7 +142,7 @@ std::string get_cl_signed_type_from_element_size(size_t element_size)
std::string get_cl_select_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -174,7 +175,7 @@ std::string get_cl_select_type_from_data_type(const DataType &dt)
std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -192,7 +193,7 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
std::string get_data_size_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::S8:
@@ -244,8 +245,9 @@ bool dot8_supported(const cl::Device &device)
const GPUTarget gpu_target = get_target_from_name(device_name);
// SW_WORKAROUND: Workaround for DDK revision r14p0.to enable cl_arm_integer_dot_product_int8
- std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 };
- return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
+ std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+ return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") ||
+ sw_workaround_issue.count(gpu_target) != 0);
}
bool dot8_acc_supported(const cl::Device &device)
@@ -256,23 +258,23 @@ bool dot8_acc_supported(const cl::Device &device)
CLVersion get_cl_version(const cl::Device &device)
{
std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
- if(version_str.find("OpenCL 3") != std::string::npos)
+ if (version_str.find("OpenCL 3") != std::string::npos)
{
return CLVersion::CL30;
}
- else if(version_str.find("OpenCL 2") != std::string::npos)
+ else if (version_str.find("OpenCL 2") != std::string::npos)
{
return CLVersion::CL20;
}
- else if(version_str.find("OpenCL 1.2") != std::string::npos)
+ else if (version_str.find("OpenCL 1.2") != std::string::npos)
{
return CLVersion::CL12;
}
- else if(version_str.find("OpenCL 1.1") != std::string::npos)
+ else if (version_str.find("OpenCL 1.1") != std::string::npos)
{
return CLVersion::CL11;
}
- else if(version_str.find("OpenCL 1.0") != std::string::npos)
+ else if (version_str.find("OpenCL 1.0") != std::string::npos)
{
return CLVersion::CL10;
}
@@ -287,14 +289,15 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n
return (pos != std::string::npos);
}
-bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout)
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile,
+ const Size2D &kernel_size,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
- std::vector<WinogradConfiguration> winograd_configs_nchw =
- {
+ std::vector<WinogradConfiguration> winograd_configs_nchw = {
WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)),
@@ -303,11 +306,9 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
- WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
- };
+ WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))};
- std::vector<WinogradConfiguration> winograd_configs_nhwc =
- {
+ std::vector<WinogradConfiguration> winograd_configs_nhwc = {
WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
@@ -324,19 +325,21 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
std::pair<int, int>(kernel_size.width, kernel_size.height));
// Return true if supported
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end());
+ return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) !=
+ winograd_configs_nchw.end());
}
else
{
- return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
+ return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) !=
+ winograd_configs_nhwc.end());
}
}
size_t preferred_vector_width(const cl::Device &device, const DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::S8:
@@ -382,7 +385,7 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device)
cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr);
- if(err == CL_SUCCESS)
+ if (err == CL_SUCCESS)
{
return pixel_aligment;
}
@@ -396,12 +399,14 @@ bool get_cl_non_uniform_work_group_supported(const cl::Device &device)
{
cl_bool supported = CL_FALSE;
- cl_int err = clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr);
+ cl_int err =
+ clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr);
return (err == CL_SUCCESS && supported == CL_TRUE);
}
-cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
+cl::Kernel
+create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
{
opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
@@ -409,7 +414,8 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_
auto kernel_src = klib.program(program_name);
const std::string kernel_path = klib.kernel_path();
- return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary));
+ return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path,
+ build_opts, kernel_src.is_binary));
}
cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size)
@@ -423,8 +429,9 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio
bool get_wbsm_support_info(const cl::Device &device)
{
cl_bitfield capabilities = 0;
- cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr);
- if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield),
+ &capabilities, nullptr);
+ if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
{
return true;
}
@@ -433,35 +440,33 @@ bool get_wbsm_support_info(const cl::Device &device)
void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
{
- cl_int err = clSetKernelExecInfo(kernel.get(),
- CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
- sizeof(cl_int),
- &wbsm_hint);
+ cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
+ sizeof(cl_int), &wbsm_hint);
ARM_COMPUTE_UNUSED(err);
ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS);
}
bool export_to_cl_image(const ITensorInfo *tensor)
{
- if(tensor->tensor_shape()[0] % 4 != 0)
+ if (tensor->tensor_shape()[0] % 4 != 0)
{
return false;
}
// If not floating point
- if(!is_data_type_float(tensor->data_type()))
+ if (!is_data_type_float(tensor->data_type()))
{
return false;
}
// Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
- if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+ if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
{
return false;
}
// Check cl image pitch alignment
- if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
+ if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
{
return false;
}
@@ -471,7 +476,7 @@ bool export_to_cl_image(const ITensorInfo *tensor)
const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
- if(image_w > max_image_w || image_h > max_image_h)
+ if (image_w > max_image_w || image_h > max_image_h)
{
return false;
}
@@ -481,9 +486,9 @@ bool export_to_cl_image(const ITensorInfo *tensor)
void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values)
{
- for(const int value : values)
+ for (const int value : values)
{
- if(value > max_manual_loop_unrolling)
+ if (value > max_manual_loop_unrolling)
{
built_opts.add_option("-DUNROLL_WITH_PRAGMA");
return;
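
A usage sketch for the helpers touched in this hunk; the caller, the option name and the unroll factors are illustrative assumptions, not part of the patch.

void add_cl_image_build_options(const arm_compute::ITensorInfo *weights, arm_compute::CLBuildOptions &opts)
{
    // export_to_cl_image() bundles the four eligibility checks above: width
    // divisible by 4, floating-point data type, cl_khr_image2d_from_buffer
    // support, and a non-zero cl_image pitch alignment.
    if (arm_compute::export_to_cl_image(weights))
    {
        opts.add_option("-DEXPORT_TO_CL_IMAGE=1");
    }
    // Factors above max_manual_loop_unrolling make the kernels rely on
    // "#pragma unroll" instead of manually repeated code.
    arm_compute::set_unroll_with_pragma(opts, {2, 4, 8});
}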
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index c5a0796c3a..e69d006750 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "arm_compute/core/Error.h"
+
#include "src/gpu/cl/ClKernelLibrary.h"
+
#include <algorithm>
#include <array>
#include <fstream>
@@ -31,8 +34,7 @@
#include <vector>
namespace arm_compute
{
-CLKernelLibrary::CLKernelLibrary()
- : _compile_context()
+CLKernelLibrary::CLKernelLibrary() : _compile_context()
{
opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built
}
@@ -41,13 +43,15 @@ CLKernelLibrary &CLKernelLibrary::get()
static CLKernelLibrary _kernel_library;
return _kernel_library;
}
-Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set<std::string> &build_options_set) const
+Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name,
+ const std::set<std::string> &build_options_set) const
{
const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
const std::string program_name = klib.program_name(kernel_name);
auto program = klib.program(program_name);
const std::string &kernel_path = CLKernelLibrary::get().get_kernel_path();
- return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary);
+ return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set,
+ program.is_binary);
}
std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const
{
@@ -131,4 +135,4 @@ CLCompileContext &CLKernelLibrary::get_compile_context()
{
return _compile_context;
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
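
A minimal caller for the reflowed create_kernel() above; the kernel name and build options are placeholders, and the include path is taken from this file's own header.

#include "arm_compute/core/CL/CLKernelLibrary.h"

#include <set>
#include <string>

arm_compute::Kernel build_example_kernel()
{
    const std::set<std::string> build_opts{"-DDATA_TYPE=float", "-DVEC_SIZE=4"};
    // create_kernel() resolves the program that owns the kernel, then defers
    // compilation (or cache lookup) to the CLCompileContext member.
    return arm_compute::CLKernelLibrary::get().create_kernel("example_kernel", build_opts);
}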
diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp
index b9c59ac6f0..05b351fc25 100644
--- a/src/core/CL/CLMutableCommandBuffer.cpp
+++ b/src/core/CL/CLMutableCommandBuffer.cpp
@@ -31,8 +31,7 @@
namespace arm_compute
{
-CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue)
- : CLCommandBuffer()
+CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer()
{
cl_int status = CL_SUCCESS;
@@ -52,7 +51,10 @@ CLMutableCommandBuffer::~CLMutableCommandBuffer()
handle_cl_error("clReleaseCommandBufferKHR", status);
}
-void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local)
+void CLMutableCommandBuffer::add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local)
{
ARM_COMPUTE_ERROR_ON(state() != State::Created);
@@ -65,18 +67,8 @@ void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &off
};
const auto error = clCommandNDRangeKernelKHR(
- _cb,
- nullptr,
- properties,
- kernel,
- global.dimensions(),
- offset.dimensions() != 0 ? offset.get() : nullptr,
- global.get(),
- local.dimensions() != 0 ? local.get() : nullptr,
- 0,
- nullptr,
- nullptr,
- &mutable_handle);
+ _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr,
+ global.get(), local.dimensions() != 0 ? local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle);
handle_cl_error("clCommandNDRangeKernelKHR", error);
@@ -114,7 +106,7 @@ void CLMutableCommandBuffer::finalize()
size_t arg_no = 0;
- for(auto &mut_dispatch_cfg : _mut_dispatch_cfgs)
+ for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs)
{
ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size());
mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no];
@@ -132,9 +124,7 @@ void CLMutableCommandBuffer::update()
{
ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
- const auto error = clUpdateMutableCommandsKHR(
- _cb,
- &_mut_cfg);
+ const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg);
handle_cl_error("clUpdateMutableCommandsKHR", error);
}
@@ -143,13 +133,7 @@ void CLMutableCommandBuffer::enqueue()
{
ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
- const auto error = clEnqueueCommandBufferKHR(
- 0,
- nullptr,
- _cb,
- 0,
- nullptr,
- nullptr);
+ const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr);
handle_cl_error("clEnqueueCommandBufferKHR", error);
}
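
A lifecycle sketch for the class above under cl_khr_command_buffer with mutable dispatch; the queue, kernel and global range are assumed to exist, and the argument-update step is elided.

void record_and_replay(cl_command_queue queue, cl_kernel kernel, const cl::NDRange &gws)
{
    arm_compute::CLMutableCommandBuffer cb(queue);
    cb.add_kernel(kernel, cl::NullRange, gws, cl::NullRange); // records clCommandNDRangeKernelKHR
    cb.finalize(); // binds each mutable dispatch config to its argument list
    cb.enqueue();  // clEnqueueCommandBufferKHR
    // ...update kernel arguments through the buffer's mutable configs...
    cb.update();   // clUpdateMutableCommandsKHR applies the new arguments
    cb.enqueue();  // replays the same command buffer with updated arguments
}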
diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h
index 04e94b0bb2..8997d7d1fd 100644
--- a/src/core/CL/CLMutableCommandBuffer.h
+++ b/src/core/CL/CLMutableCommandBuffer.h
@@ -57,7 +57,10 @@ public:
/** Disallow move assignment. */
CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete;
- void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override;
+ void add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local) override;
void finalize() override;
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 289300b3a1..290ed32648 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "support/StringSupport.h"
namespace arm_compute
@@ -38,15 +39,15 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
const cl::Context &ctx = CLKernelLibrary::get().context();
- const cl::Buffer &buffer = tensor->cl_buffer();
+ const cl::Buffer &buffer = tensor->cl_buffer();
const ITensorInfo *info = tensor->info();
- ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(),
- "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement");
+ ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to "
+ "satisfy cl_image pitch alignment requirement");
- const size_t image_w{ info->dimension(0) / 4 };
- const size_t image_h{ info->tensor_shape().total_size() / info->dimension(0) };
- const size_t max_image_w{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>() };
- const size_t max_image_h{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() };
+ const size_t image_w{info->dimension(0) / 4};
+ const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)};
+ const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>()};
+ const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()};
ARM_COMPUTE_UNUSED(max_image_w, max_image_h);
ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image");
@@ -58,18 +59,22 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type);
}
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type)
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+ const cl::Buffer &buffer,
+ const TensorShape &shape2d,
+ DataType data_type,
+ size_t image_row_pitch,
+ CLImage2DType image_type)
{
ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
"The extension cl_khr_image2d_from_buffer is not supported on the target platform");
ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
"Impossible to retrieve the cl_image pitch alignment");
- ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr,
- "Cannot create cl_image from empty cl_buffer");
+ ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from empty cl_buffer");
cl_channel_type cl_data_type;
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
cl_data_type = CL_FLOAT;
@@ -84,7 +89,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
cl_mem cl_image;
cl_int err = CL_SUCCESS;
- const cl_image_format format = { CL_RGBA, cl_data_type };
+ const cl_image_format format = {CL_RGBA, cl_data_type};
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));
@@ -94,7 +99,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
desc.image_width = shape2d[0];
desc.image_height = shape2d[1];
- switch(image_type)
+ switch (image_type)
{
case CLImage2DType::ReadOnly:
cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
@@ -114,7 +119,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
void handle_cl_error(const std::string &function_name, cl_int error_code)
{
- if(error_code != CL_SUCCESS)
+ if (error_code != CL_SUCCESS)
{
std::string error_message = function_name + " - Error code: " + std::to_string(error_code);
ARM_COMPUTE_ERROR(error_message.c_str());
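
A worked example of the texel arithmetic in create_image2d_from_tensor() and create_image2d_from_buffer(); the tensor shape and row pitch are assumptions, and ctx/buffer stand for an existing cl::Context and cl::Buffer.

// An F32 tensor of shape (64, 128) exports as a CL_RGBA / CL_FLOAT image:
// each texel packs four consecutive floats, so image_w = 64 / 4 = 16 texels
// and image_h = (64 * 128) / 64 = 128 rows. The row pitch is the row stride
// in bytes; for an unpadded row that is 64 * sizeof(float) = 256.
const arm_compute::TensorShape shape2d(16U, 128U);
cl::Image2D image = arm_compute::create_image2d_from_buffer(
    ctx, buffer, shape2d, arm_compute::DataType::F32,
    /* image_row_pitch */ 256, arm_compute::CLImage2DType::ReadOnly);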
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index de9c1b3194..f9dcfeac3a 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -72,7 +72,12 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
*
* @return cl::Image2D object
*/
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+ const cl::Buffer &buffer,
+ const TensorShape &shape2d,
+ DataType data_type,
+ size_t image_row_pitch,
+ CLImage2DType image_type);
/** Check for CL error code and throw exception accordingly.
*
diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h
index 7b5294e452..50d224f1c0 100644
--- a/src/core/CL/CLValidate.h
+++ b/src/core/CL/CLValidate.h
@@ -29,11 +29,13 @@
namespace arm_compute
{
-#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+ CLKernelLibrary::get().fp16_supported()))
-#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+ CLKernelLibrary::get().fp16_supported()))
/** Return an error if int64_base_atomics extension is not supported by the device.
*
@@ -43,11 +45,13 @@ namespace arm_compute
*
* @return Status
*/
-inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
+inline arm_compute::Status
+error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
{
- if(!CLKernelLibrary::get().int64_base_atomics_supported())
+ if (!CLKernelLibrary::get().int64_base_atomics_supported())
{
- return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
+ return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line,
+ "Atomic functions are not supported");
}
return arm_compute::Status{};
}
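
A hypothetical kernel validate() routine showing how the reflowed helpers above are meant to combine:

inline arm_compute::Status validate_example(const arm_compute::ITensorInfo *src)
{
    // Expands to error_on_unsupported_fp16() against the device's FP16 support.
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
    // Fails with UNSUPPORTED_EXTENSION_USE when int64_base_atomics is missing.
    ARM_COMPUTE_RETURN_ON_ERROR(
        arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
    return arm_compute::Status{};
}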
diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp
index a53fdbbab6..f96b24d2a9 100644
--- a/src/core/CL/DefaultLWSHeuristics.cpp
+++ b/src/core/CL/DefaultLWSHeuristics.cpp
@@ -31,13 +31,13 @@ cl::NDRange get_gemm_lws(size_t gws_x, size_t gws_y, size_t gws_z)
{
ARM_COMPUTE_UNUSED(gws_y);
- if(gws_z != 1)
+ if (gws_z != 1)
{
return cl::NDRange(4, 4, 2);
}
else
{
- if(gws_x > 256)
+ if (gws_x > 256)
{
return cl::NDRange(2, 16, 1);
}
@@ -59,9 +59,9 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z)
{
ARM_COMPUTE_UNUSED(gws_z);
- if(gws_x < gws_y)
+ if (gws_x < gws_y)
{
- if(gws_x < 4)
+ if (gws_x < 4)
{
return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1);
}
@@ -81,7 +81,7 @@ cl::NDRange get_dwc_lws(size_t gws_x, size_t gws_y, size_t gws_z)
ARM_COMPUTE_UNUSED(gws_y);
ARM_COMPUTE_UNUSED(gws_z);
- if(gws_x < 32)
+ if (gws_x < 32)
{
return cl::NDRange(gws_x, 4, 4);
}
@@ -100,7 +100,7 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
const size_t gws_y = gws[1];
const size_t gws_z = gws[2];
- switch(kernel_type)
+ switch (kernel_type)
{
case CLKernelType::GEMM:
{
@@ -124,4 +124,4 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
}
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
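
Spot checks of the heuristic above; the gws triples are assumed inputs, the lws outputs follow directly from the branches shown.

//   GEMM, gws = (512, 64, 1) -> lws (2, 16, 1)  (z == 1 and x > 256)
//   GEMM, gws = (128, 64, 8) -> lws (4, 4, 2)   (z != 1)
//   DWC,  gws = (16, 8, 4)   -> lws (16, 4, 4)  (x < 32 keeps x unchanged)
const cl::NDRange lws =
    arm_compute::get_default_lws_for_type(arm_compute::CLKernelType::GEMM, cl::NDRange(512, 64, 1));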
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index dc3a86a528..ac53e7f1d2 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -25,18 +25,23 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/Utils.h"
#include <cstddef>
-void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
+void arm_compute::enqueue(cl::CommandQueue &queue,
+ ICLKernel &kernel,
+ const Window &window,
+ const cl::NDRange &lws_hint,
+ bool use_dummy_work_items)
{
- if(kernel.kernel()() == nullptr)
+ if (kernel.kernel()() == nullptr)
{
return;
}
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_ERROR_ON(window[i].step() == 0);
// Make sure that dimensions > Z are 1
@@ -46,7 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
// Check for empty NDRange
- if(gws.dimensions() == 0)
+ if (gws.dimensions() == 0)
{
return;
}
@@ -54,7 +59,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
kernel.cache_gws(gws);
cl::NDRange valid_lws;
- if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
+ if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
{
valid_lws = cl::NullRange;
}
@@ -65,12 +70,12 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
cl::NDRange lws = cl::NullRange;
- if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
+ if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
{
lws = valid_lws;
}
- if(CLKernelLibrary::get().is_wbsm_supported())
+ if (CLKernelLibrary::get().is_wbsm_supported())
{
set_wbsm(kernel.kernel(), kernel.wbsm_hint());
}
@@ -90,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
// Calculate offset to the start of the window
unsigned int offset_first_element = info->offset_first_element_in_bytes();
- for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+ for (unsigned int n = 0; n < info->num_dimensions(); ++n)
{
offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n];
}
@@ -98,7 +103,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
unsigned int idx_start = idx;
_kernel.setArg(idx++, tensor->cl_buffer());
- for(unsigned int d = 0; d < dimension_size; ++d)
+ for (unsigned int d = 0; d < dimension_size; ++d)
{
_kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]);
_kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step()));
@@ -107,7 +112,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
_kernel.setArg<cl_uint>(idx++, offset_first_element);
ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
- "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+ "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel",
+ dimension_size, num_arguments_per_tensor<dimension_size>());
ARM_COMPUTE_UNUSED(idx_start);
}
@@ -178,7 +184,7 @@ void ICLKernel::set_target(cl::Device &device)
size_t ICLKernel::get_max_workgroup_size()
{
- if(_max_workgroup_size == 0)
+ if (_max_workgroup_size == 0)
{
_max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
}
@@ -187,7 +193,7 @@ size_t ICLKernel::get_max_workgroup_size()
cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items)
{
- if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+ if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
{
return cl::NullRange;
}
@@ -196,7 +202,7 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work
(window.y().end() - window.y().start()) / window.y().step(),
(window.z().end() - window.z().start()) / window.z().step());
- if(use_dummy_work_items)
+ if (use_dummy_work_items)
{
gws.get()[0] = get_next_power_two(gws[0]);
gws.get()[1] = get_next_power_two(gws[1]);
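
A worked example for the dummy-work-items branch of gws_from_window(); the window geometry is assumed.

// With x: [0, 20) step 4, y: [0, 20) step 4, z: [0, 3) step 1 the raw gws is
// (5, 5, 3). use_dummy_work_items rounds the first two dimensions up to the
// next power of two, so the enqueued gws becomes (8, 8, 3) and the kernel
// must bounds-check its global ids against the real extent.
const cl::NDRange gws = arm_compute::ICLKernel::gws_from_window(window, /* use_dummy_work_items */ true);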
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index c82809cef3..6aebef15a5 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -27,10 +27,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/IKernel.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLTuningParams.h"
#include "src/core/CL/DefaultLWSHeuristics.h"
@@ -43,14 +43,14 @@ namespace
{
bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
{
- if(lws0.dimensions() != lws1.dimensions())
+ if (lws0.dimensions() != lws1.dimensions())
{
return false;
}
- for(size_t i = 0; i < lws0.dimensions(); ++i)
+ for (size_t i = 0; i < lws0.dimensions(); ++i)
{
- if(lws0.get()[i] != lws1.get()[i])
+ if (lws0.get()[i] != lws1.get()[i])
{
return false;
}
@@ -71,7 +71,7 @@ private:
*
* @return The number of arguments enqueued per array object.
*/
- template <unsigned int dimension_size>
+ template <unsigned int dimension_size>
constexpr static unsigned int num_arguments_per_array()
{
return num_arguments_per_tensor<dimension_size>();
@@ -80,7 +80,7 @@ private:
*
* @return The number of arguments enqueued per tensor object.
*/
- template <unsigned int dimension_size>
+ template <unsigned int dimension_size>
constexpr static unsigned int num_arguments_per_tensor()
{
return 2 + 2 * dimension_size;
@@ -116,11 +116,13 @@ protected:
* @param[in] window The maximum window which will be returned by window()
* @param[in] tuning_params_hint (Optional) Tuning parameters to use.
*/
- void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
+ void configure_internal(const Window &window,
+ CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(),
+ 0))
{
_tuning_params_hint = tuning_params_hint;
- if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
+ if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
{
// Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which
// will be recalculated with use_dummy_work_items flag at run time again anyway.
@@ -133,7 +135,13 @@ protected:
public:
/** Constructor */
ICLKernel()
- : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange)
+ : _kernel(nullptr),
+ _target(GPUTarget::MIDGARD),
+ _config_id(arm_compute::default_config_id),
+ _max_workgroup_size(0),
+ _type(CLKernelType::UNKNOWN),
+ _tuning_params_hint(),
+ _cached_gws(cl::NullRange)
{
}
/** Returns a reference to the OpenCL kernel of this object.
@@ -161,7 +169,11 @@ public:
* @param[in] window Window the kernel will be executed on.
*/
template <typename T>
- void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+ void add_1D_array_argument(unsigned int &idx,
+ const ICLArray<T> *array,
+ const Strides &strides,
+ unsigned int num_dimensions,
+ const Window &window)
{
add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
}
@@ -184,7 +196,7 @@ public:
*/
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
{
- if(cond)
+ if (cond)
{
add_1D_tensor_argument(idx, tensor, window);
}
@@ -208,7 +220,7 @@ public:
*/
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
{
- if(cond)
+ if (cond)
{
add_2D_tensor_argument(idx, tensor, window);
}
@@ -469,7 +481,11 @@ private:
* @param[in] window Window the kernel will be executed on.
*/
template <typename T, unsigned int dimension_size>
- void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
+ void add_array_argument(unsigned int &idx,
+ const ICLArray<T> *array,
+ const Strides &strides,
+ unsigned int num_dimensions,
+ const Window &window);
/** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
*
* @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -505,7 +521,11 @@ private:
*
* @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
*/
-void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
+void enqueue(cl::CommandQueue &queue,
+ ICLKernel &kernel,
+ const Window &window,
+ const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(),
+ bool use_dummy_work_items = false);
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
*
@@ -516,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c
* @param[in] window Window the kernel will be executed on.
*/
template <typename T, unsigned int dimension_size>
-void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+void ICLKernel::add_array_argument(
+ unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
{
ARM_COMPUTE_ERROR_ON(array == nullptr);
// Calculate offset to the start of the window
unsigned int offset_first_element = 0;
- for(unsigned int n = 0; n < num_dimensions; ++n)
+ for (unsigned int n = 0; n < num_dimensions; ++n)
{
offset_first_element += window[n].start() * strides[n];
}
@@ -531,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
unsigned int idx_start = idx;
_kernel.setArg(idx++, array->cl_buffer());
- for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+ for (unsigned int dimension = 0; dimension < dimension_size; dimension++)
{
_kernel.setArg<cl_uint>(idx++, strides[dimension]);
_kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
@@ -540,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
_kernel.setArg<cl_uint>(idx++, offset_first_element);
ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
- "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
+ "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel",
+ dimension_size, num_arguments_per_array<dimension_size>());
ARM_COMPUTE_UNUSED(idx_start);
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLKERNEL_H */
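
The argument accounting behind num_arguments_per_tensor<dim>() and the trailing assertion, written out once:

// 2 + 2 * dim slots per tensor: the cl_buffer and the first-element offset,
// plus one stride and one stride * step value per dimension. A 3D tensor
// therefore occupies 2 + 2 * 3 = 8 consecutive kernel arguments, which is
// exactly what ARM_COMPUTE_ERROR_ON_MSG_VAR verifies after the setter loop.
static_assert(2 + 2 * 3 == 8, "a 3D tensor adds 8 kernel arguments");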
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
index 5d8295bdfe..3f7edbb88d 100644
--- a/src/core/CL/ICLSimple2DKernel.cpp
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -40,6 +40,5 @@ void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, slice);
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h
index 5246492401..97bc1e58c2 100644
--- a/src/core/CL/ICLSimple2DKernel.h
+++ b/src/core/CL/ICLSimple2DKernel.h
@@ -37,5 +37,5 @@ public:
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
index fef1a86125..71d7d1f07b 100644
--- a/src/core/CL/ICLSimple3DKernel.cpp
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -42,6 +42,5 @@ void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h
index ff0b274663..5071b6b339 100644
--- a/src/core/CL/ICLSimple3DKernel.h
+++ b/src/core/CL/ICLSimple3DKernel.h
@@ -39,5 +39,5 @@ public:
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
index d67fefdf71..c31db8355f 100644
--- a/src/core/CL/ICLSimpleKernel.cpp
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -22,30 +22,35 @@
* SOFTWARE.
*/
#include "src/core/CL/ICLSimpleKernel.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/WindowHelpers.h"
using namespace arm_compute;
-ICLSimpleKernel::ICLSimpleKernel()
- : _input(nullptr), _output(nullptr)
+ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr)
{
}
-void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+void ICLSimpleKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_elems_processed_per_iteration,
+ bool border_undefined,
+ const BorderSize &border_size)
{
_input = input;
_output = output;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ Window win =
+ calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
output_access);
output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
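
A worked configuration for the window setup above; the element count and input width are assumptions, and kernel stands for any concrete subclass since ICLSimpleKernel itself is abstract.

// With num_elems_processed_per_iteration = 16 and an input 100 elements wide,
// calculate_max_window() rounds the x-range up to 112 (7 steps of 16), and
// update_window_and_padding() grows both access windows so every 16-element
// read/write stays inside the padded tensors.
kernel.configure(input, output, /* num_elems_processed_per_iteration */ 16);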
diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h
index b35547a217..6afd7309aa 100644
--- a/src/core/CL/ICLSimpleKernel.h
+++ b/src/core/CL/ICLSimpleKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -55,12 +56,16 @@ public:
* @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
* @param[in] border_size (Optional) Size of the border.
*/
- void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_elems_processed_per_iteration,
+ bool border_undefined = false,
+ const BorderSize &border_size = BorderSize());
protected:
const ICLTensor *_input;
ICLTensor *_output;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */
diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp
index b541bff04a..0771db7f50 100644
--- a/src/core/CL/ICLTensor.cpp
+++ b/src/core/CL/ICLTensor.cpp
@@ -27,8 +27,7 @@
using namespace arm_compute;
-ICLTensor::ICLTensor()
- : _mapping(nullptr)
+ICLTensor::ICLTensor() : _mapping(nullptr)
{
}
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index b092dfb4e2..35421d025e 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -36,11 +36,7 @@
namespace arm_compute
{
-CLSymbols::CLSymbols() noexcept(false)
- : _loaded(
-{
- false, false
-})
+CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false})
{
}
@@ -52,9 +48,9 @@ CLSymbols &CLSymbols::get()
bool CLSymbols::load_default()
{
- static const std::vector<std::string> libraries_filenames{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };
+ static const std::vector<std::string> libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"};
- if(_loaded.first)
+ if (_loaded.first)
{
return _loaded.second;
}
@@ -62,34 +58,32 @@ bool CLSymbols::load_default()
// Indicate that default loading has been tried
_loaded.first = true;
- if(load(libraries_filenames, /* use_loader */ false))
+ if (load(libraries_filenames, /* use_loader */ false))
{
- ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library");
+ ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+ "Failed to load OpenCL symbols from shared library");
return true;
}
#ifdef __ANDROID__
// When running in NDK environment, the above libraries are not accessible.
- static const std::vector<std::string> android_libraries_filenames{ "libOpenCL-pixel.so", "libOpenCL-car.so" };
+ static const std::vector<std::string> android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"};
- if(load(android_libraries_filenames, /* use_loader */ true))
+ if (load(android_libraries_filenames, /* use_loader */ true))
{
- ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from android shared library");
+ ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+ "Failed to load OpenCL symbols from android shared library");
return true;
}
#endif // __ANDROID__
// If not returned till here then libraries not found
std::stringstream ss;
- std::for_each(libraries_filenames.begin(), libraries_filenames.end(), [&ss](const std::string & s)
- {
- ss << s << " ";
- });
+ std::for_each(libraries_filenames.begin(), libraries_filenames.end(),
+ [&ss](const std::string &s) { ss << s << " "; });
#ifdef __ANDROID__
- std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), [&ss](const std::string & s)
- {
- ss << s << " ";
- });
+ std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(),
+ [&ss](const std::string &s) { ss << s << " "; });
#endif // __ANDROID__
std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl;
return false;
@@ -99,15 +93,15 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u
{
void *handle = nullptr;
unsigned int index = 0;
- for(index = 0; index < libraries_filenames.size(); ++index)
+ for (index = 0; index < libraries_filenames.size(); ++index)
{
handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL);
- if(handle != nullptr)
+ if (handle != nullptr)
{
break;
}
}
- if(index == libraries_filenames.size())
+ if (index == libraries_filenames.size())
{
// Set status of loading to failed
_loaded.second = false;
@@ -115,22 +109,23 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u
}
#ifdef __ANDROID__
- typedef void* (*loadOpenCLPointer_t)(const char* name);
+ typedef void *(*loadOpenCLPointer_t)(const char *name);
loadOpenCLPointer_t loadOpenCLPointer;
- if (use_loader) {
+ if (use_loader)
+ {
typedef void (*enableOpenCL_t)();
- enableOpenCL_t enableOpenCL =
- reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL"));
+ enableOpenCL_t enableOpenCL = reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL"));
enableOpenCL();
- loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(
- dlsym(handle, "loadOpenCLPointer"));
- } else {
+ loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(dlsym(handle, "loadOpenCLPointer"));
+ }
+ else
+ {
loadOpenCLPointer = nullptr;
}
-#define LOAD_FUNCTION_PTR(func_name, _handle) \
- func_name##_ptr = reinterpret_cast<decltype(func_name) *>( use_loader ? \
- loadOpenCLPointer(#func_name) : dlsym(handle, #func_name));
+#define LOAD_FUNCTION_PTR(func_name, _handle) \
+ func_name##_ptr = reinterpret_cast<decltype(func_name) *>(use_loader ? loadOpenCLPointer(#func_name) \
+ : dlsym(handle, #func_name));
#else /* __ANDROID__ */
(void)use_loader; // Avoid unused warning
#define LOAD_FUNCTION_PTR(func_name, handle) \
@@ -234,12 +229,11 @@ bool opencl_is_available()
}
} // namespace arm_compute
-cl_int clEnqueueMarker(cl_command_queue command_queue,
- cl_event *event)
+cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, event);
}
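
Every exported wrapper below repeats the dispatch shape of clEnqueueMarker(); schematically (clSomething and its _ptr member are placeholders, not real entry points):

cl_int clSomething(/* forwarded arguments */)
{
    arm_compute::CLSymbols::get().load_default(); // lazily dlopen()s the CL library once
    auto func = arm_compute::CLSymbols::get().clSomething_ptr;
    if (func != nullptr)
    {
        return func(/* forwarded arguments */); // forward to the real driver
    }
    // No driver found: fail softly instead of crashing the caller.
    return CL_OUT_OF_RESOURCES;
}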
@@ -249,12 +243,11 @@ cl_int clEnqueueMarker(cl_command_queue command_queue,
}
}
-cl_int clWaitForEvents(cl_uint num_events,
- const cl_event *event_list)
+cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_events, event_list);
}
@@ -264,12 +257,18 @@ cl_int clWaitForEvents(cl_uint num_events,
}
}
-cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr,
- size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMMap(cl_command_queue command_queue,
+ cl_bool blocking_map,
+ cl_map_flags flags,
+ void *svm_ptr,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event);
}
@@ -279,12 +278,15 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_
}
}
-cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMUnmap(cl_command_queue command_queue,
+ void *svm_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
}
@@ -298,7 +300,7 @@ void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, size, alignment);
}
@@ -312,7 +314,7 @@ void clSVMFree(cl_context context, void *svm_pointer)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSVMFree_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
func(context, svm_pointer);
}
@@ -326,7 +328,7 @@ cl_int clGetContextInfo(cl_context context,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -343,7 +345,7 @@ cl_command_queue clCreateCommandQueue(cl_context context,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, device, properties, errcode_ret);
}
@@ -360,7 +362,7 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, device, properties, errcode_ret);
}
@@ -370,17 +372,16 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c
}
}
-cl_context clCreateContext(
- const cl_context_properties *properties,
- cl_uint num_devices,
- const cl_device_id *devices,
- void (*pfn_notify)(const char *, const void *, size_t, void *),
- void *user_data,
- cl_int *errcode_ret)
+cl_context clCreateContext(const cl_context_properties *properties,
+ cl_uint num_devices,
+ const cl_device_id *devices,
+ void (*pfn_notify)(const char *, const void *, size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret);
}
@@ -398,7 +399,7 @@ cl_context clCreateContextFromType(const cl_context_properties *properties,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(properties, device_type, pfn_notify, user_data, errcode_ret);
}
@@ -408,17 +409,16 @@ cl_context clCreateContextFromType(const cl_context_properties *properties,
}
}
-cl_int clBuildProgram(
- cl_program program,
- cl_uint num_devices,
- const cl_device_id *device_list,
- const char *options,
- void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
- void *user_data)
+cl_int clBuildProgram(cl_program program,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const char *options,
+ void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+ void *user_data)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, num_devices, device_list, options, pfn_notify, user_data);
}
@@ -428,22 +428,22 @@ cl_int clBuildProgram(
}
}
-cl_int clEnqueueNDRangeKernel(
- cl_command_queue command_queue,
- cl_kernel kernel,
- cl_uint work_dim,
- const size_t *global_work_offset,
- const size_t *global_work_size,
- const size_t *local_work_size,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size,
+ num_events_in_wait_list, event_wait_list, event);
}
else
{
@@ -451,15 +451,11 @@ cl_int clEnqueueNDRangeKernel(
}
}
-cl_int clSetKernelArg(
- cl_kernel kernel,
- cl_uint arg_index,
- size_t arg_size,
- const void *arg_value)
+cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, arg_index, arg_size, arg_value);
}
@@ -473,7 +469,7 @@ cl_int clRetainMemObject(cl_mem memobj)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj);
}
@@ -487,7 +483,7 @@ cl_int clReleaseMemObject(cl_mem memobj)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj);
}
@@ -497,17 +493,16 @@ cl_int clReleaseMemObject(cl_mem memobj)
}
}
-cl_int clEnqueueUnmapMemObject(
- cl_command_queue command_queue,
- cl_mem memobj,
- void *mapped_ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
+ cl_mem memobj,
+ void *mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
}
@@ -521,7 +516,7 @@ cl_int clRetainCommandQueue(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -535,7 +530,7 @@ cl_int clReleaseContext(cl_context context)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context);
}
@@ -548,7 +543,7 @@ cl_int clReleaseEvent(cl_event event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event);
}
@@ -558,22 +553,22 @@ cl_int clReleaseEvent(cl_event event)
}
}
-cl_int clEnqueueWriteBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_write,
- size_t offset,
- size_t size,
- const void *ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+ event);
}
else
{
@@ -581,22 +576,22 @@ cl_int clEnqueueWriteBuffer(
}
}
-cl_int clEnqueueReadBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_read,
- size_t offset,
- size_t size,
- void *ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+ event);
}
else
{
@@ -604,17 +599,16 @@ cl_int clEnqueueReadBuffer(
}
}
-cl_int clGetProgramBuildInfo(
- cl_program program,
- cl_device_id device,
- cl_program_build_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetProgramBuildInfo(cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -628,7 +622,7 @@ cl_int clRetainProgram(cl_program program)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program);
}
@@ -638,27 +632,27 @@ cl_int clRetainProgram(cl_program program)
}
}
-void *clEnqueueMapBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_map,
- cl_map_flags map_flags,
- size_t offset,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event,
- cl_int *errcode_ret)
+void *clEnqueueMapBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+ return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list,
+ event_wait_list, event, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -670,7 +664,7 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -680,24 +674,23 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue)
}
}
-cl_program clCreateProgramWithBinary(
- cl_context context,
- cl_uint num_devices,
- const cl_device_id *device_list,
- const size_t *lengths,
- const unsigned char **binaries,
- cl_int *binary_status,
- cl_int *errcode_ret)
+cl_program clCreateProgramWithBinary(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const size_t *lengths,
+ const unsigned char **binaries,
+ cl_int *binary_status,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -709,7 +702,7 @@ cl_int clRetainContext(cl_context context)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context);
}
@@ -723,7 +716,7 @@ cl_int clReleaseProgram(cl_program program)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program);
}
@@ -737,7 +730,7 @@ cl_int clFlush(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clFlush_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -751,7 +744,7 @@ cl_int clFinish(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clFinish_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -761,16 +754,15 @@ cl_int clFinish(cl_command_queue command_queue)
}
}
-cl_int clGetProgramInfo(
- cl_program program,
- cl_program_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetProgramInfo(cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -780,20 +772,17 @@ cl_int clGetProgramInfo(
}
}
-cl_kernel clCreateKernel(
- cl_program program,
- const char *kernel_name,
- cl_int *errcode_ret)
+cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, kernel_name, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -805,7 +794,7 @@ cl_int clRetainKernel(cl_kernel kernel)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel);
}
@@ -815,22 +804,17 @@ cl_int clRetainKernel(cl_kernel kernel)
}
}
-cl_mem clCreateBuffer(
- cl_context context,
- cl_mem_flags flags,
- size_t size,
- void *host_ptr,
- cl_int *errcode_ret)
+cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, size, host_ptr, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -839,21 +823,17 @@ cl_mem clCreateBuffer(
}
cl_program clCreateProgramWithSource(
- cl_context context,
- cl_uint count,
- const char **strings,
- const size_t *lengths,
- cl_int *errcode_ret)
+ cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, count, strings, lengths, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -865,7 +845,7 @@ cl_int clReleaseKernel(cl_kernel kernel)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel);
}
@@ -878,12 +858,12 @@ cl_int clReleaseKernel(cl_kernel kernel)
cl_int clGetDeviceIDs(cl_platform_id platform,
cl_device_type device_type,
cl_uint num_entries,
- cl_device_id *devices,
+ cl_device_id *devices,
cl_uint *num_devices)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(platform, device_type, num_entries, devices, num_devices);
}
@@ -901,7 +881,7 @@ cl_int clGetDeviceInfo(cl_device_id device,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -911,15 +891,12 @@ cl_int clGetDeviceInfo(cl_device_id device,
}
}
-cl_int clGetMemObjectInfo(cl_mem memobj,
- cl_mem_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetMemObjectInfo(
+ cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -933,7 +910,7 @@ cl_int clRetainEvent(cl_event event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event);
}
@@ -951,7 +928,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(platform, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -965,7 +942,7 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_entries, platforms, num_platforms);
}
@@ -975,17 +952,16 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint
}
}
-cl_int
-clGetKernelWorkGroupInfo(cl_kernel kernel,
- cl_device_id device,
- cl_kernel_work_group_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -995,16 +971,15 @@ clGetKernelWorkGroupInfo(cl_kernel kernel,
}
}
-cl_int
-clGetCommandQueueInfo(cl_command_queue command_queue,
- cl_command_queue_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetCommandQueueInfo(cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -1014,16 +989,15 @@ clGetCommandQueueInfo(cl_command_queue command_queue,
}
}
-cl_int
-clGetKernelInfo(cl_kernel kernel,
- cl_kernel_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetKernelInfo(cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -1033,16 +1007,15 @@ clGetKernelInfo(cl_kernel kernel,
}
}
-cl_int
-clGetEventProfilingInfo(cl_event event,
- cl_profiling_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetEventProfilingInfo(cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -1052,23 +1025,22 @@ clGetEventProfilingInfo(cl_event event,
}
}
-cl_mem
-clCreateImage(cl_context context,
- cl_mem_flags flags,
- const cl_image_format *image_format,
- const cl_image_desc *image_desc,
- void *host_ptr,
- cl_int *errcode_ret)
+cl_mem clCreateImage(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format *image_format,
+ const cl_image_desc *image_desc,
+ void *host_ptr,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateImage_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -1076,14 +1048,12 @@ clCreateImage(cl_context context,
}
}
-cl_int clSetKernelExecInfo(cl_kernel kernel,
- cl_kernel_exec_info param_name,
- size_t param_value_size,
- const void *param_value)
+cl_int
+clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, param_name, param_value_size, param_value);
}
@@ -1093,22 +1063,21 @@ cl_int clSetKernelExecInfo(cl_kernel kernel,
}
}
-cl_command_buffer_khr clCreateCommandBufferKHR(
- cl_uint num_queues,
- const cl_command_queue* queues,
- const cl_command_buffer_properties_khr* properties,
- cl_int* errcode_ret)
+cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues,
+ const cl_command_queue *queues,
+ const cl_command_buffer_properties_khr *properties,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_queues, queues, properties, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_INVALID_OPERATION;
}
@@ -1122,7 +1091,7 @@ cl_int clFinalizeCommandBufferKHR(cl_command_buffer_khr command_buffer)
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer);
}
@@ -1137,7 +1106,7 @@ cl_int clRetainCommandBufferKHR(cl_command_buffer_khr command_buffer)
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer);
}
@@ -1152,7 +1121,7 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer)
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer);
}
@@ -1162,18 +1131,17 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer)
}
}
-cl_int clEnqueueCommandBufferKHR(
- cl_uint num_queues,
- cl_command_queue* queues,
- cl_command_buffer_khr command_buffer,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event)
+cl_int clEnqueueCommandBufferKHR(cl_uint num_queues,
+ cl_command_queue *queues,
+ cl_command_buffer_khr command_buffer,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event);
}
@@ -1183,27 +1151,26 @@ cl_int clEnqueueCommandBufferKHR(
}
}
-
-cl_int clCommandNDRangeKernelKHR(
- cl_command_buffer_khr command_buffer,
- cl_command_queue command_queue,
- const cl_ndrange_kernel_command_properties_khr* properties,
- cl_kernel kernel,
- cl_uint work_dim,
- const size_t* global_work_offset,
- const size_t* global_work_size,
- const size_t* local_work_size,
- cl_uint num_sync_points_in_wait_list,
- const cl_sync_point_khr* sync_point_wait_list,
- cl_sync_point_khr* sync_point,
- cl_mutable_command_khr* mutable_handle)
+cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr command_buffer,
+ cl_command_queue command_queue,
+ const cl_ndrange_kernel_command_properties_khr *properties,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_sync_points_in_wait_list,
+ const cl_sync_point_khr *sync_point_wait_list,
+ cl_sync_point_khr *sync_point,
+ cl_mutable_command_khr *mutable_handle)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle);
+ return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size,
+ local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle);
}
else
{
@@ -1211,14 +1178,13 @@ cl_int clCommandNDRangeKernelKHR(
}
}
-cl_int clUpdateMutableCommandsKHR(
- cl_command_buffer_khr command_buffer,
- const cl_mutable_base_config_khr* mutable_config)
+cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr command_buffer,
+ const cl_mutable_base_config_khr *mutable_config)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer, mutable_config);
}
@@ -1228,23 +1194,22 @@ cl_int clUpdateMutableCommandsKHR(
}
}
-cl_mem
-clImportMemoryARM(cl_context context,
- cl_mem_flags flags,
- const cl_import_properties_arm *properties,
- void *memory,
- size_t size,
- cl_int *errcode_ret)
+cl_mem clImportMemoryARM(cl_context context,
+ cl_mem_flags flags,
+ const cl_import_properties_arm *properties,
+ void *memory,
+ size_t size,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, properties, memory, size, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
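
Every wrapper in this file follows the same lazy-dispatch shape visible throughout the hunks above: load_default() resolves the real OpenCL runtime's symbols into CLSymbols once, the call is forwarded through the resolved function pointer when present, and otherwise a benign error is returned (object-creating entry points such as clCreateImage and clImportMemoryARM additionally write the error through errcode_ret when the caller supplied one). A minimal consolidated sketch of the pattern, using clRetainEvent as the example; the fallback code varies per entry point (CL_OUT_OF_RESOURCES here, CL_INVALID_OPERATION for the command-buffer extensions above):

cl_int clRetainEvent(cl_event event)
{
    arm_compute::CLSymbols::get().load_default();                // resolve the runtime's symbols, done once
    auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr; // pointer into the real driver, or nullptr
    if (func != nullptr)
    {
        return func(event);         // forward to the driver
    }
    else
    {
        return CL_OUT_OF_RESOURCES; // benign fallback when no runtime could be loaded
    }
}
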
diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h
index 3f93c8d6fc..02faae2369 100644
--- a/src/core/CL/cl_kernels/activation_float_helpers.h
+++ b/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -31,7 +31,8 @@
#endif // GPU_ARCH == GPU_ARCH_BIFROST
// Hard-Swish
-#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
+#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
// Logistic Activation
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
@@ -49,13 +50,16 @@
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
// Leaky RELU Activation
-#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
// Soft RELU Activation
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
// ELU Activation
-#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
+#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \
+ (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
// Absolute Activation
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
@@ -70,7 +74,8 @@
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
// GELU Activation
-#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
+#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
// Identity Activation
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
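
The reflowed activation macros are behaviour-preserving: hard_swish_op, for instance, still computes x * clamp(x + 3, 0, 6) / 6, with 0.166666667 standing in for 1/6. A scalar C++ rendering of the same expression, assuming a float DATA_TYPE (A_VAL and B_VAL are unused by this activation):

#include <algorithm>

// Scalar equivalent of hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) for float.
float hard_swish(float x)
{
    return x * (std::min(std::max(x + 3.0f, 0.0f), 6.0f) * 0.166666667f); // 0.166666667f ~= 1/6
}
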
diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h
index c420578546..c758ff1278 100644
--- a/src/core/CL/cl_kernels/activation_quant_helpers.h
+++ b/src/core/CL/cl_kernels/activation_quant_helpers.h
@@ -60,17 +60,17 @@ inline TYPE identiy_op(TYPE x)
}
#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
#if defined(S1_VAL) && defined(S2_VAL)
#if defined(O1_VAL) && defined(O2_VAL)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
@@ -78,17 +78,14 @@ inline TYPE identiy_op(TYPE x)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
#endif /* defined(O1_VAL) && defined(O2_VAL) */
#else /* defined(S1_VAL) && defined(S2_VAL) */
-#define PERFORM_ACTIVATION_QUANT(act, data) \
- ({ \
- data = ACTIVATION_OP(act, data); \
- })
+#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); })
#endif /* defined(S1_VAL) && defined(S2_VAL) */
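
When both scale and offset pairs are defined, PERFORM_ACTIVATION_QUANT applies the activation in the quantized domain and then requantizes from the input scale/offset (S1_VAL, O1_VAL) to the output ones (S2_VAL, O2_VAL) via round((q - O1) * S1/S2 + O2) followed by a saturating convert back to the quantized type. A scalar sketch for an 8-bit unsigned type, with the clamp standing in for CONVERT_SAT:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Requantisation step of PERFORM_ACTIVATION_QUANT, scalar uint8 illustration.
uint8_t requantize(uint8_t q, float s1, float o1, float s2, float o2)
{
    const float f = std::round((static_cast<float>(q) - o1) * (s1 / s2) + o2);
    return static_cast<uint8_t>(std::clamp(f, 0.0f, 255.0f)); // saturating convert
}
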
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 0e938cb668..4bef02314f 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -34,14 +34,14 @@
*
*/
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
-#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
+#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
// offset == 0
-#define scalar_access_0_1(x) ((x).s0)
-#define scalar_access_0_2(x) ((x).s01)
-#define scalar_access_0_3(x) ((x).s012)
-#define scalar_access_0_4(x) ((x).s0123)
-#define scalar_access_0_8(x) ((x).s01234567)
+#define scalar_access_0_1(x) ((x).s0)
+#define scalar_access_0_2(x) ((x).s01)
+#define scalar_access_0_3(x) ((x).s012)
+#define scalar_access_0_4(x) ((x).s0123)
+#define scalar_access_0_8(x) ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
// offset == 1
@@ -100,8 +100,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
- ({})
+#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({})
#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
@@ -186,8 +185,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
-#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_TENSOR
/** Load 2D tensor (consecutive rows and columns) with Z offset.
@@ -202,8 +203,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
- ({})
+#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({})
#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
@@ -279,8 +279,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @}*/ // end of group LOAD_TENSOR_M0XN0
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -395,8 +397,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK
/** Partially load the 0 to (n-1)th rows of the given variables
@@ -517,8 +521,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** Load a block that can be partial in both x and y dimensions
*
* @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
@@ -541,22 +547,23 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
- { \
- LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else \
- { \
- LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
}
/** Load a block that can only be partial in x but not y.
*
@@ -578,14 +585,15 @@
* @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else \
- { \
- LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \
+ PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
}
/** Load a block that can only be partial in y but not x.
*
@@ -607,14 +615,15 @@
* @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
*/
-#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
- if(!(PARTIAL_COND_Y)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else \
- { \
- LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_COND_Y) \
+ if (!(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
}
/** @} */ // end of group LOAD_BLOCK_PARTIAL
/** Boundary-aware GeMM block load
@@ -676,28 +685,33 @@
*/
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
- LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
-#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
/** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE
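
The four cases above are selected at compile time from PARTIAL_STORE_M0/PARTIAL_STORE_N0 so that fully-interior tiles pay nothing: only when a dimension can be partial are the destination rows first zero-initialised (REPEAT_VAR_INIT_TO_CONST) so that lanes the partial load skips hold defined values. A runtime analogue of the same idea, with illustrative names:

#include <algorithm>
#include <vector>

// Runtime sketch of LOAD_BLOCK_BOUNDARY_AWARE: zero-fill the M0 x N0 tile,
// then copy only the valid sub-block, leaving out-of-bounds lanes at zero.
std::vector<float> load_tile(const float *src, int src_stride,
                             int m0, int n0, int valid_m, int valid_n)
{
    std::vector<float> tile(static_cast<size_t>(m0) * n0, 0.0f); // REPEAT_VAR_INIT_TO_CONST analogue
    const int m = std::min(m0, valid_m); // PARTIAL_COND_Y picks PARTIAL_STORE_M0 over M0
    const int n = std::min(n0, valid_n); // PARTIAL_COND_X picks PARTIAL_STORE_N0 over N0
    for (int r = 0; r < m; ++r)
        std::copy_n(src + r * src_stride, n, tile.begin() + r * n0);
    return tile;
}
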
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -795,8 +809,10 @@
* @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
* @{
*/
-#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
-#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
/** @} */ // end of group LOAD_TEXTURE2D
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded.
@@ -815,7 +831,7 @@
#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##0; \
- if(Y_MASK##0 != 0) \
+ if (Y_MASK##0 != 0) \
BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
else \
BASENAME##0 = 0;
@@ -824,7 +840,7 @@
LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##1; \
- if(Y_MASK##1 != 0) \
+ if (Y_MASK##1 != 0) \
BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
else \
BASENAME##1 = 0;
@@ -833,7 +849,7 @@
LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##2; \
- if(Y_MASK##2 != 0) \
+ if (Y_MASK##2 != 0) \
BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
else \
BASENAME##2 = 0;
@@ -842,7 +858,7 @@
LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##3; \
- if(Y_MASK##3 != 0) \
+ if (Y_MASK##3 != 0) \
BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
else \
BASENAME##3 = 0;
@@ -851,7 +867,7 @@
LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##4; \
- if(Y_MASK##4 != 0) \
+ if (Y_MASK##4 != 0) \
BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
else \
BASENAME##4 = 0;
@@ -860,7 +876,7 @@
LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##5; \
- if(Y_MASK##5 != 0) \
+ if (Y_MASK##5 != 0) \
BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
else \
BASENAME##5 = 0;
@@ -869,7 +885,7 @@
LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##6; \
- if(Y_MASK##6 != 0) \
+ if (Y_MASK##6 != 0) \
BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
else \
BASENAME##6 = 0;
@@ -878,7 +894,7 @@
LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##7; \
- if(Y_MASK##7 != 0) \
+ if (Y_MASK##7 != 0) \
BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
else \
BASENAME##7 = 0;
@@ -887,7 +903,7 @@
LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##8; \
- if(Y_MASK##8 != 0) \
+ if (Y_MASK##8 != 0) \
BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
else \
BASENAME##8 = 0;
@@ -896,7 +912,7 @@
LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##9; \
- if(Y_MASK##9 != 0) \
+ if (Y_MASK##9 != 0) \
BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
else \
BASENAME##9 = 0;
@@ -905,7 +921,7 @@
LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##A; \
- if(Y_MASK##A != 0) \
+ if (Y_MASK##A != 0) \
BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
else \
BASENAME##A = 0;
@@ -914,7 +930,7 @@
LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##B; \
- if(Y_MASK##B != 0) \
+ if (Y_MASK##B != 0) \
BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
else \
BASENAME##B = 0;
@@ -923,7 +939,7 @@
LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##C; \
- if(Y_MASK##C != 0) \
+ if (Y_MASK##C != 0) \
BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
else \
BASENAME##C = 0;
@@ -932,7 +948,7 @@
LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##D; \
- if(Y_MASK##D != 0) \
+ if (Y_MASK##D != 0) \
BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
else \
BASENAME##D = 0;
@@ -941,7 +957,7 @@
LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##E; \
- if(Y_MASK##E != 0) \
+ if (Y_MASK##E != 0) \
BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
else \
BASENAME##E = 0;
@@ -950,7 +966,7 @@
LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##F; \
- if(Y_MASK##F != 0) \
+ if (Y_MASK##F != 0) \
BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
else \
BASENAME##F = 0;
@@ -976,8 +992,10 @@
* @param[in] Y_MASK The y-axis mask vector. If 0, forces BASENAMEn to 0
* @{
*/
-#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
-#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
/** @} */ // end of group LOAD_BLOCK_INDIRECT
/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -1088,8 +1106,10 @@
* @param[in] STRIDE_Y The stride in y-axis direction
* @{
*/
-#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
-#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
/** Basic macros to calculate Z offset values from Z0 to Zn-1
@@ -1187,8 +1207,10 @@
* @param[in] STRIDE_Y The stride value in y-axis direction
* @{
*/
-#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
-#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET
/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
@@ -1199,8 +1221,7 @@
* @param[in] SCALE The scale factor
* @{
*/
-#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
- BASENAME##0 *= (DATA_TYPE)SCALE;
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
@@ -1275,7 +1296,7 @@
* @{
*/
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
-#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
/** Create a new vector containing the values at the given index for a set of given vectors
@@ -1287,8 +1308,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
@@ -1297,13 +1317,20 @@
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 4) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
-#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 8) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
-#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
+ (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
+ (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
/** Create a new vector containing the values at the given index. Utility macros for transposing a colum-vector
@@ -1315,8 +1342,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
@@ -1329,9 +1355,10 @@
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 8) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
-#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+ (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
/** @} */ // end of group COLUMN_VECTOR_SCALARn
/** Create transposed vectors of the given vectors
@@ -1343,8 +1370,7 @@
* @param[in] TYPE The data type of the transposed vectors
* @{
*/
-#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
- COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
@@ -1417,8 +1443,7 @@
* @param[in] BIAS The basename of the added variables
* @{
*/
-#define ADD_ROW_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS##0;
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
#define ADD_ROW_2(BASENAME, BIAS) \
ADD_ROW_1(BASENAME, BIAS) \
@@ -1493,7 +1518,7 @@
* @{
*/
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
-#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
/** Broadcast (add single value) to the each element of the destination variables
@@ -1503,8 +1528,7 @@
* @param[in] BIAS The variable containing the value to add
* @{
*/
-#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS;
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
@@ -1578,7 +1602,7 @@
* @{
*/
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
-#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK_BROADCAST
/** Apply activation to the given variables
@@ -1668,8 +1692,10 @@
* @param[in] B_VAL Additional value required by the activation
* @{
*/
-#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
-#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
/** @} */ // end of group ACTIVATION_BLOCK
/** Apply convert_<data_type> to the given variables
@@ -1773,6 +1799,8 @@
* @param[in] BASENAME_DST The basename of the destination variables
* @{
*/
-#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
/** @} */ // end of group CONVERT_BLOCK
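
Each dispatching macro in this header comes as a FOO/FOO_STR pair because the ## operator suppresses argument expansion: the outer macro expands its arguments first (M0, N0 and friends are frequently macros or tuning parameters themselves), and only then does the _STR level paste the token. A stand-alone illustration of the idiom:

#define ROW_2(x)       ((x) + (x))
#define PICK_STR(n, x) ROW_##n(x)     // pastes immediately: n must already be a literal token
#define PICK(n, x)     PICK_STR(n, x) // extra level expands n first

#define M0 2
int v = PICK(M0, 21); // M0 -> 2, then ROW_##2 -> ROW_2(21) -> 42
// Calling PICK_STR(M0, 21) directly would paste to ROW_M0(21) and fail to compile.
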
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index b2ceaf92f3..87a1875f93 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -81,11 +81,11 @@
* @return The reversed vector
* @{
*/
-#define REV1(x) ((x))
-#define REV2(x) ((x).s10)
-#define REV3(x) ((x).s210)
-#define REV4(x) ((x).s3210)
-#define REV8(x) ((x).s76543210)
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn
@@ -99,7 +99,7 @@
* @{
*/
#define REVERSE_STR(x, s) REV##s((x))
-#define REVERSE(x, s) REVERSE_STR(x, s)
+#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
@@ -138,16 +138,16 @@
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))
-#define ROT16_0(x) ((x))
-#define ROT16_1(x) ((x).sF0123456789ABCDE)
-#define ROT16_2(x) ((x).sEF0123456789ABCD)
-#define ROT16_3(x) ((x).sDEF0123456789ABC)
-#define ROT16_4(x) ((x).sCDEF0123456789AB)
-#define ROT16_5(x) ((x).sBCDEF0123456789A)
-#define ROT16_6(x) ((x).sABCDEF0123456789)
-#define ROT16_7(x) ((x).s9ABCDEF012345678)
-#define ROT16_8(x) ((x).s89ABCDEF01234567)
-#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
@@ -168,7 +168,7 @@
* @{
*/
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
-#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
@@ -179,11 +179,11 @@
* @return The vector filled with offset values
* @{
*/
-#define V_OFFS1(dt) (dt##1)(0)
-#define V_OFFS2(dt) (dt##2)(0, 1)
-#define V_OFFS3(dt) (dt##3)(0, 1, 2)
-#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
-#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS1(dt) (dt##1)(0)
+#define V_OFFS2(dt) (dt##2)(0, 1)
+#define V_OFFS3(dt) (dt##3)(0, 1, 2)
+#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn
@@ -197,11 +197,11 @@
* @{
*/
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
-#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
#define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
+#define VLOAD(size) VLOAD_STR(size)
/** Extended partial vload that correctly handles scalar values as well.
* Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops
@@ -219,23 +219,23 @@
* @{
*/
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
-#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
+#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
#define NO_LOAD(data, offs, ptr) \
{ \
}
// Size == 1 (scalar)
-#define vload_partial_1_0 NO_LOAD
-#define vload_partial_1_1 vload1
-#define vload_partial_1_2 NO_LOAD
-#define vload_partial_1_3 NO_LOAD
-#define vload_partial_1_4 NO_LOAD
-#define vload_partial_1_5 NO_LOAD
-#define vload_partial_1_6 NO_LOAD
-#define vload_partial_1_7 NO_LOAD
-#define vload_partial_1_8 NO_LOAD
-#define vload_partial_1_9 NO_LOAD
+#define vload_partial_1_0 NO_LOAD
+#define vload_partial_1_1 vload1
+#define vload_partial_1_2 NO_LOAD
+#define vload_partial_1_3 NO_LOAD
+#define vload_partial_1_4 NO_LOAD
+#define vload_partial_1_5 NO_LOAD
+#define vload_partial_1_6 NO_LOAD
+#define vload_partial_1_7 NO_LOAD
+#define vload_partial_1_8 NO_LOAD
+#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
@@ -244,16 +244,16 @@
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD
// Size == 2
-#define vload_partial_2_0 NO_LOAD
-#define vload_partial_2_1 vload_partial_1
-#define vload_partial_2_2 vload_partial_2
-#define vload_partial_2_3 NO_LOAD
-#define vload_partial_2_4 NO_LOAD
-#define vload_partial_2_5 NO_LOAD
-#define vload_partial_2_6 NO_LOAD
-#define vload_partial_2_7 NO_LOAD
-#define vload_partial_2_8 NO_LOAD
-#define vload_partial_2_9 NO_LOAD
+#define vload_partial_2_0 NO_LOAD
+#define vload_partial_2_1 vload_partial_1
+#define vload_partial_2_2 vload_partial_2
+#define vload_partial_2_3 NO_LOAD
+#define vload_partial_2_4 NO_LOAD
+#define vload_partial_2_5 NO_LOAD
+#define vload_partial_2_6 NO_LOAD
+#define vload_partial_2_7 NO_LOAD
+#define vload_partial_2_8 NO_LOAD
+#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
@@ -262,16 +262,16 @@
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD
// Size == 3
-#define vload_partial_3_0 NO_LOAD
-#define vload_partial_3_1 vload_partial_1
-#define vload_partial_3_2 vload_partial_2
-#define vload_partial_3_3 vload_partial_3
-#define vload_partial_3_4 NO_LOAD
-#define vload_partial_3_5 NO_LOAD
-#define vload_partial_3_6 NO_LOAD
-#define vload_partial_3_7 NO_LOAD
-#define vload_partial_3_8 NO_LOAD
-#define vload_partial_3_9 NO_LOAD
+#define vload_partial_3_0 NO_LOAD
+#define vload_partial_3_1 vload_partial_1
+#define vload_partial_3_2 vload_partial_2
+#define vload_partial_3_3 vload_partial_3
+#define vload_partial_3_4 NO_LOAD
+#define vload_partial_3_5 NO_LOAD
+#define vload_partial_3_6 NO_LOAD
+#define vload_partial_3_7 NO_LOAD
+#define vload_partial_3_8 NO_LOAD
+#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
@@ -280,16 +280,16 @@
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD
// Size == 4
-#define vload_partial_4_0 NO_LOAD
-#define vload_partial_4_1 vload_partial_1
-#define vload_partial_4_2 vload_partial_2
-#define vload_partial_4_3 vload_partial_3
-#define vload_partial_4_4 vload_partial_4
-#define vload_partial_4_5 NO_LOAD
-#define vload_partial_4_6 NO_LOAD
-#define vload_partial_4_7 NO_LOAD
-#define vload_partial_4_8 NO_LOAD
-#define vload_partial_4_9 NO_LOAD
+#define vload_partial_4_0 NO_LOAD
+#define vload_partial_4_1 vload_partial_1
+#define vload_partial_4_2 vload_partial_2
+#define vload_partial_4_3 vload_partial_3
+#define vload_partial_4_4 vload_partial_4
+#define vload_partial_4_5 NO_LOAD
+#define vload_partial_4_6 NO_LOAD
+#define vload_partial_4_7 NO_LOAD
+#define vload_partial_4_8 NO_LOAD
+#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
@@ -298,16 +298,16 @@
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD
// Size == 8
-#define vload_partial_8_0 NO_LOAD
-#define vload_partial_8_1 vload_partial_1
-#define vload_partial_8_2 vload_partial_2
-#define vload_partial_8_3 vload_partial_3
-#define vload_partial_8_4 vload_partial_4
-#define vload_partial_8_5 vload_partial_5
-#define vload_partial_8_6 vload_partial_6
-#define vload_partial_8_7 vload_partial_7
-#define vload_partial_8_8 vload_partial_8
-#define vload_partial_8_9 NO_LOAD
+#define vload_partial_8_0 NO_LOAD
+#define vload_partial_8_1 vload_partial_1
+#define vload_partial_8_2 vload_partial_2
+#define vload_partial_8_3 vload_partial_3
+#define vload_partial_8_4 vload_partial_4
+#define vload_partial_8_5 vload_partial_5
+#define vload_partial_8_6 vload_partial_6
+#define vload_partial_8_7 vload_partial_7
+#define vload_partial_8_8 vload_partial_8
+#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
@@ -316,16 +316,16 @@
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD
// Size == 16
-#define vload_partial_16_0 NO_LOAD
-#define vload_partial_16_1 vload_partial_1
-#define vload_partial_16_2 vload_partial_2
-#define vload_partial_16_3 vload_partial_3
-#define vload_partial_16_4 vload_partial_4
-#define vload_partial_16_5 vload_partial_5
-#define vload_partial_16_6 vload_partial_6
-#define vload_partial_16_7 vload_partial_7
-#define vload_partial_16_8 vload_partial_8
-#define vload_partial_16_9 vload_partial_9
+#define vload_partial_16_0 NO_LOAD
+#define vload_partial_16_1 vload_partial_1
+#define vload_partial_16_2 vload_partial_2
+#define vload_partial_16_3 vload_partial_3
+#define vload_partial_16_4 vload_partial_4
+#define vload_partial_16_5 vload_partial_5
+#define vload_partial_16_6 vload_partial_6
+#define vload_partial_16_7 vload_partial_7
+#define vload_partial_16_8 vload_partial_8
+#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
@@ -351,17 +351,13 @@
* @param[in] PTR The base pointer
* @{
*/
-#define vload_partial_1(DATA, OFFSET, PTR) \
- DATA.s0 = vload1(OFFSET, PTR);
+#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);
-#define vload_partial_2(DATA, OFFSET, PTR) \
- DATA.s01 = vload2(OFFSET, PTR);
+#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);
-#define vload_partial_3(DATA, OFFSET, PTR) \
- DATA.s012 = vload3(OFFSET, PTR);
+#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);
-#define vload_partial_4(DATA, OFFSET, PTR) \
- DATA.s0123 = vload4(OFFSET, PTR);
+#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);
#define vload_partial_5(DATA, OFFSET, PTR) \
vload_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -375,8 +371,7 @@
vload_partial_4(DATA.s0123, OFFSET, PTR); \
vload_partial_3(DATA.s456, OFFSET, PTR + 4);
-#define vload_partial_8(DATA, OFFSET, PTR) \
- DATA.s01234567 = vload8(OFFSET, PTR);
+#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);
#define vload_partial_9(DATA, OFFSET, PTR) \
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -406,13 +401,12 @@
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
-#define vload_partial_16(DATA, OFFSET, PTR) \
- DATA = vload16(OFFSET, PTR);
+#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
/** @} */ // end of groupd vload_partial_n
/** @} */ // end of groupd VLOAD_PARTIAL
-#define PIXEL_UNIT4 1
-#define PIXEL_UNIT8 2
+#define PIXEL_UNIT4 1
+#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4
/** Utility macro to convert a vector size in pixel unit.
@@ -425,27 +419,45 @@
* @{
*/
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
-#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
+#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
-#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_floatx2(img, x_coord, y_coord) \
+ (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_floatx4(img, x_coord, y_coord) \
+ (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \
+ read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
-#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_halfx2(img, x_coord, y_coord) \
+ (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_halfx4(img, x_coord, y_coord) \
+ (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \
+ read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
-#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
-#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#define write_image2d_floatx2(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_floatx4(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
-#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
-#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#define write_image2d_halfx2(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_halfx4(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
/** Utility macro to read a 2D OpenCL image object.
@@ -462,7 +474,7 @@
* @{
*/
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
-#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
/** @} */
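/* Editor's sketch (not part of this patch): READ_IMAGE2D pastes the data type
 * and vector width together to pick one of the read_image2d_* helpers, e.g.
 * the float read variants defined just above this hunk. `src`, `x` and `y`
 * are hypothetical: */
// float16 texels = READ_IMAGE2D(float, 4, src, x, y);
// resolves to read_image2d_floatx4(src, x, y)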
/** Utility macro to write a 2D OpenCL image object.
@@ -478,26 +490,28 @@
*
* @{
*/
-#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
-#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
+ write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
+ WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
/** @} */
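/* Editor's sketch (not part of this patch): the write-side counterpart picks a
 * write_image2d_* helper the same way; `dst`, `x`, `y` and `texels` are
 * hypothetical: */
// WRITE_IMAGE2D(float, 4, dst, x, y, texels);
// resolves to write_image2d_floatx4(dst, x, y, texels)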
#define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
+#define VSTORE(size) VSTORE_STR(size)
-#define float1 float
-#define half1 half
-#define char1 char
-#define uchar1 uchar
-#define short1 short
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
#define ushort1 ushort
-#define int1 int
-#define uint1 uint
-#define long1 long
-#define ulong1 ulong
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
#define double1 double
-#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
/** Extended partial vstore that correctly handles scalar values as well.
@@ -516,23 +530,23 @@
* @{
*/
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
-#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
+#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
#define NO_STORE(data, offs, ptr) \
{ \
}
// Size == 1 (scalar)
-#define vstore_partial_1_0 NO_STORE
-#define vstore_partial_1_1 vstore1
-#define vstore_partial_1_2 NO_STORE
-#define vstore_partial_1_3 NO_STORE
-#define vstore_partial_1_4 NO_STORE
-#define vstore_partial_1_5 NO_STORE
-#define vstore_partial_1_6 NO_STORE
-#define vstore_partial_1_7 NO_STORE
-#define vstore_partial_1_8 NO_STORE
-#define vstore_partial_1_9 NO_STORE
+#define vstore_partial_1_0 NO_STORE
+#define vstore_partial_1_1 vstore1
+#define vstore_partial_1_2 NO_STORE
+#define vstore_partial_1_3 NO_STORE
+#define vstore_partial_1_4 NO_STORE
+#define vstore_partial_1_5 NO_STORE
+#define vstore_partial_1_6 NO_STORE
+#define vstore_partial_1_7 NO_STORE
+#define vstore_partial_1_8 NO_STORE
+#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
@@ -541,16 +555,16 @@
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
-#define vstore_partial_2_0 NO_STORE
-#define vstore_partial_2_1 vstore_partial_1
-#define vstore_partial_2_2 vstore_partial_2
-#define vstore_partial_2_3 NO_STORE
-#define vstore_partial_2_4 NO_STORE
-#define vstore_partial_2_5 NO_STORE
-#define vstore_partial_2_6 NO_STORE
-#define vstore_partial_2_7 NO_STORE
-#define vstore_partial_2_8 NO_STORE
-#define vstore_partial_2_9 NO_STORE
+#define vstore_partial_2_0 NO_STORE
+#define vstore_partial_2_1 vstore_partial_1
+#define vstore_partial_2_2 vstore_partial_2
+#define vstore_partial_2_3 NO_STORE
+#define vstore_partial_2_4 NO_STORE
+#define vstore_partial_2_5 NO_STORE
+#define vstore_partial_2_6 NO_STORE
+#define vstore_partial_2_7 NO_STORE
+#define vstore_partial_2_8 NO_STORE
+#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
@@ -559,16 +573,16 @@
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
-#define vstore_partial_3_0 NO_STORE
-#define vstore_partial_3_1 vstore_partial_1
-#define vstore_partial_3_2 vstore_partial_2
-#define vstore_partial_3_3 vstore_partial_3
-#define vstore_partial_3_4 NO_STORE
-#define vstore_partial_3_5 NO_STORE
-#define vstore_partial_3_6 NO_STORE
-#define vstore_partial_3_7 NO_STORE
-#define vstore_partial_3_8 NO_STORE
-#define vstore_partial_3_9 NO_STORE
+#define vstore_partial_3_0 NO_STORE
+#define vstore_partial_3_1 vstore_partial_1
+#define vstore_partial_3_2 vstore_partial_2
+#define vstore_partial_3_3 vstore_partial_3
+#define vstore_partial_3_4 NO_STORE
+#define vstore_partial_3_5 NO_STORE
+#define vstore_partial_3_6 NO_STORE
+#define vstore_partial_3_7 NO_STORE
+#define vstore_partial_3_8 NO_STORE
+#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
@@ -577,16 +591,16 @@
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
-#define vstore_partial_4_0 NO_STORE
-#define vstore_partial_4_1 vstore_partial_1
-#define vstore_partial_4_2 vstore_partial_2
-#define vstore_partial_4_3 vstore_partial_3
-#define vstore_partial_4_4 vstore_partial_4
-#define vstore_partial_4_5 NO_STORE
-#define vstore_partial_4_6 NO_STORE
-#define vstore_partial_4_7 NO_STORE
-#define vstore_partial_4_8 NO_STORE
-#define vstore_partial_4_9 NO_STORE
+#define vstore_partial_4_0 NO_STORE
+#define vstore_partial_4_1 vstore_partial_1
+#define vstore_partial_4_2 vstore_partial_2
+#define vstore_partial_4_3 vstore_partial_3
+#define vstore_partial_4_4 vstore_partial_4
+#define vstore_partial_4_5 NO_STORE
+#define vstore_partial_4_6 NO_STORE
+#define vstore_partial_4_7 NO_STORE
+#define vstore_partial_4_8 NO_STORE
+#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
@@ -595,16 +609,16 @@
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
-#define vstore_partial_8_0 NO_STORE
-#define vstore_partial_8_1 vstore_partial_1
-#define vstore_partial_8_2 vstore_partial_2
-#define vstore_partial_8_3 vstore_partial_3
-#define vstore_partial_8_4 vstore_partial_4
-#define vstore_partial_8_5 vstore_partial_5
-#define vstore_partial_8_6 vstore_partial_6
-#define vstore_partial_8_7 vstore_partial_7
-#define vstore_partial_8_8 vstore_partial_8
-#define vstore_partial_8_9 NO_STORE
+#define vstore_partial_8_0 NO_STORE
+#define vstore_partial_8_1 vstore_partial_1
+#define vstore_partial_8_2 vstore_partial_2
+#define vstore_partial_8_3 vstore_partial_3
+#define vstore_partial_8_4 vstore_partial_4
+#define vstore_partial_8_5 vstore_partial_5
+#define vstore_partial_8_6 vstore_partial_6
+#define vstore_partial_8_7 vstore_partial_7
+#define vstore_partial_8_8 vstore_partial_8
+#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
@@ -613,16 +627,16 @@
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
-#define vstore_partial_16_0 NO_STORE
-#define vstore_partial_16_1 vstore_partial_1
-#define vstore_partial_16_2 vstore_partial_2
-#define vstore_partial_16_3 vstore_partial_3
-#define vstore_partial_16_4 vstore_partial_4
-#define vstore_partial_16_5 vstore_partial_5
-#define vstore_partial_16_6 vstore_partial_6
-#define vstore_partial_16_7 vstore_partial_7
-#define vstore_partial_16_8 vstore_partial_8
-#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_0 NO_STORE
+#define vstore_partial_16_1 vstore_partial_1
+#define vstore_partial_16_2 vstore_partial_2
+#define vstore_partial_16_3 vstore_partial_3
+#define vstore_partial_16_4 vstore_partial_4
+#define vstore_partial_16_5 vstore_partial_5
+#define vstore_partial_16_6 vstore_partial_6
+#define vstore_partial_16_7 vstore_partial_7
+#define vstore_partial_16_8 vstore_partial_8
+#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
@@ -648,17 +662,13 @@
* @param[in] PTR The base pointer
* @{
*/
-#define vstore_partial_1(DATA, OFFSET, PTR) \
- vstore1(DATA.s0, OFFSET, PTR);
+#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);
-#define vstore_partial_2(DATA, OFFSET, PTR) \
- vstore2(DATA.s01, OFFSET, PTR);
+#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);
-#define vstore_partial_3(DATA, OFFSET, PTR) \
- vstore3(DATA.s012, OFFSET, PTR);
+#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);
-#define vstore_partial_4(DATA, OFFSET, PTR) \
- vstore4(DATA.s0123, OFFSET, PTR);
+#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);
#define vstore_partial_5(DATA, OFFSET, PTR) \
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -672,8 +682,7 @@
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
-#define vstore_partial_8(DATA, OFFSET, PTR) \
- vstore8(DATA.s01234567, OFFSET, PTR);
+#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);
#define vstore_partial_9(DATA, OFFSET, PTR) \
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -703,186 +712,156 @@
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
-#define vstore_partial_16(DATA, OFFSET, PTR) \
- vstore16(DATA, OFFSET, PTR);
+#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
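/* Editor's sketch (not part of this patch): VSTORE_PARTIAL resolves through the
 * vstore_partial_<size>_<store_size> table above, so a compile-time remainder
 * picks the right helper. With a hypothetical float4 `data` and pointer `buf`: */
// VSTORE_PARTIAL(4, 3)(data, 0, buf);
// resolves to vstore_partial_4_3 -> vstore_partial_3 -> vstore3(data.s012, 0, buf)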
// The convert_* built-in functions with the _sat modifier are not supported for floating-point types,
// so we create defines without _sat to overcome this issue
-#define convert_float_sat convert_float
-#define convert_float1_sat convert_float
-#define convert_float2_sat convert_float2
-#define convert_float3_sat convert_float3
-#define convert_float4_sat convert_float4
-#define convert_float8_sat convert_float8
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
-#define convert_half_sat convert_float
-#define convert_half1_sat convert_half
-#define convert_half2_sat convert_half2
-#define convert_half3_sat convert_half3
-#define convert_half4_sat convert_half4
-#define convert_half8_sat convert_half8
-#define convert_half16_sat convert_half16
-
-#define convert_float1 convert_float
-#define convert_half1 convert_half
-#define convert_char1 convert_char
-#define convert_uchar1 convert_uchar
-#define convert_short1 convert_short
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
-#define convert_int1 convert_int
-#define convert_uint1 convert_uint
-#define convert_long1 convert_long
-#define convert_ulong1 convert_ulong
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
#define convert_double1 convert_double
-#define convert_char1_sat convert_char_sat
-#define convert_uchar1_sat convert_uchar_sat
-#define convert_uchar2_sat convert_uchar2_sat
-#define convert_uchar3_sat convert_uchar3_sat
-#define convert_uchar4_sat convert_uchar4_sat
-#define convert_uchar8_sat convert_uchar8_sat
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_uchar2_sat convert_uchar2_sat
+#define convert_uchar3_sat convert_uchar3_sat
+#define convert_uchar4_sat convert_uchar4_sat
+#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
-#define convert_short1_sat convert_short_sat
+#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
-#define convert_int1_sat convert_int_sat
-#define convert_uint1_sat convert_uint_sat
-#define convert_long1_sat convert_long_sat
-#define convert_ulong1_sat convert_ulong_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
#define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
+#define CONVERT(x, type) CONVERT_STR(x, type)
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
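/* Editor's sketch (not part of this patch): the two-level *_STR pattern forces
 * macro arguments to expand before token pasting, and the float _sat aliases
 * above make saturating converts compile for float targets too. `v` is a
 * hypothetical input vector: */
// VEC_DATA_TYPE(uchar, 16) q = CONVERT_SAT(v, VEC_DATA_TYPE(uchar, 16)); // convert_uchar16_sat(v)
// VEC_DATA_TYPE(float, 4)  f = CONVERT_SAT(v, VEC_DATA_TYPE(float, 4));  // plain convert_float4(v)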
-#define select_vec_dt_uchar(size) uchar##size
-#define select_vec_dt_char(size) char##size
+#define select_vec_dt_uchar(size) uchar##size
+#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
-#define select_vec_dt_short(size) short##size
-#define select_vec_dt_half(size) short##size
-#define select_vec_dt_uint(size) uint##size
-#define select_vec_dt_int(size) int##size
-#define select_vec_dt_float(size) int##size
-#define select_vec_dt_ulong(size) ulong##size
-#define select_vec_dt_long(size) long##size
+#define select_vec_dt_short(size) short##size
+#define select_vec_dt_half(size) short##size
+#define select_vec_dt_uint(size) uint##size
+#define select_vec_dt_int(size) int##size
+#define select_vec_dt_float(size) int##size
+#define select_vec_dt_ulong(size) ulong##size
+#define select_vec_dt_long(size) long##size
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
-#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
-#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
+#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
+#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
-#define signed_int_vec_dt_uchar(size) char##size
-#define signed_int_vec_dt_char(size) char##size
+#define signed_int_vec_dt_uchar(size) char##size
+#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
-#define signed_int_vec_dt_short(size) short##size
-#define signed_int_vec_dt_half(size) short##size
-#define signed_int_vec_dt_uint(size) int##size
-#define signed_int_vec_dt_int(size) int##size
-#define signed_int_vec_dt_float(size) int##size
-#define signed_int_vec_dt_ulong(size) long##size
-#define signed_int_vec_dt_long(size) long##size
+#define signed_int_vec_dt_short(size) short##size
+#define signed_int_vec_dt_half(size) short##size
+#define signed_int_vec_dt_uint(size) int##size
+#define signed_int_vec_dt_int(size) int##size
+#define signed_int_vec_dt_float(size) int##size
+#define signed_int_vec_dt_ulong(size) long##size
+#define signed_int_vec_dt_long(size) long##size
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
-#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
-#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
-
-#define sum_reduce_1(x) (x)
-#define sum_reduce_2(x) ((x).s0) + ((x).s1)
-#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
-#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
-#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
+#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
+#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
+
+#define sum_reduce_1(x) (x)
+#define sum_reduce_2(x) ((x).s0) + ((x).s1)
+#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
+#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
+#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
-#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
+#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
-#define prod_reduce_1(x) (x)
-#define prod_reduce_2(x) ((x).s0) * ((x).s1)
-#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
-#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
-#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
+#define prod_reduce_1(x) (x)
+#define prod_reduce_2(x) ((x).s0) * ((x).s1)
+#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
+#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
+#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
-#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
+#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
-#define max_reduce_1(x) (x)
-#define max_reduce_2(x) max(((x).s0), ((x).s1))
-#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
-#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
-#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
+#define max_reduce_1(x) (x)
+#define max_reduce_2(x) max(((x).s0), ((x).s1))
+#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
+#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
+#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
-#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
+#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
-#define min_reduce_1(x) (x)
-#define min_reduce_2(x) min(((x).s0), ((x).s1))
-#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
-#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
-#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
+#define min_reduce_1(x) (x)
+#define min_reduce_2(x) min(((x).s0), ((x).s1))
+#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
+#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
+#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
#define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF))
#define MIN_REDUCE_STR(x, size) min_reduce_##size(x)
-#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
-
-#define VECTOR_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_offset_first_element_in_bytes
-
-#define IMAGE_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR3D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR4D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_stride_w, \
- uint name##_step_w, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR5D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_stride_w, \
- uint name##_step_w, \
- uint name##_stride_v, \
- uint name##_step_v, \
- uint name##_offset_first_element_in_bytes
+#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
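/* Editor's sketch (not part of this patch): each *_REDUCE macro unrolls a
 * binary tree over the vector lanes. For a hypothetical float4 `v`: */
// float s = SUM_REDUCE(v, 4); // ((v.s0 + v.s1) + (v.s2 + v.s3))
// float m = MAX_REDUCE(v, 4); // max(max(v.s0, v.s1), max(v.s2, v.s3))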
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR5D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \
+ uint name##_step_v, uint name##_offset_first_element_in_bytes
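/* Editor's sketch (not part of this patch): these declaration macros stamp out
 * the full per-tensor parameter list of a kernel signature. For a hypothetical
 * argument name `src`, IMAGE_DECLARATION(src) expands to:
 *   __global uchar *src_ptr, uint src_stride_x, uint src_step_x,
 *   uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes */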
#define CONVERT_TO_VECTOR_STRUCT(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
@@ -890,38 +869,47 @@
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
-#define CONVERT_TO_IMAGE_STRUCT(name) \
- update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y)
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, 0)
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, \
+ name##_step_w, mod_size)
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
- tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
+ tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
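/* Editor's sketch (not part of this patch): inside a kernel, each CONVERT_TO_*
 * macro pairs with the matching *_DECLARATION to build the structs defined
 * below; the kernel name `copy` and its arguments are hypothetical: */
// __kernel void copy(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
// {
//     Image src = CONVERT_TO_IMAGE_STRUCT(src);
//     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
//     *dst.ptr = *src.ptr;
// }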
/** Structure to hold Vector information */
typedef struct Vector
@@ -970,10 +958,10 @@ typedef struct Tensor4D
*
* @return An image object
*/
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+inline Vector
+update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
- Vector vector =
- {
+ Vector vector = {
.ptr = ptr,
.offset_first_element_in_bytes = offset_first_element_in_bytes,
.stride_x = stride_x,
@@ -993,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
*
* @return An image object
*/
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+inline Image update_image_workitem_ptr(
+ __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
return img;
}
@@ -1019,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
*
* @return A 3D tensor object
*/
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
- img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return img;
}
@@ -1045,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o
*
* @return A 3D tensor object
*/
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return tensor;
}
@@ -1072,34 +1068,44 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi
*
* @return A 3D tensor object
*/
-inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
return tensor;
}
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
- uint step_w,
- uint mod_size)
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z,
+ uint stride_w,
+ uint step_w,
+ uint mod_size)
{
- Tensor4D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z,
- .stride_w = stride_w
- };
-
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
return tensor;
}
@@ -1171,7 +1177,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid
const uint x = index;
- return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ tensor->offset_first_element_in_bytes;
}
#endif // _HELPER_H
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 562c5d3236..166260a3c0 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -34,7 +34,7 @@
* @return The converted vector
*/
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
/** Quantize a floating-point scalar value to 8-bit asymmetric
*
@@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return quantized values
*/
-#define QUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
- { \
- VEC_DATA_TYPE(float, size) \
- out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
- VEC_DATA_TYPE(type, size) \
- res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
- return res; \
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
+ return res; \
}
/** Dequantize a vector of values to floating-point
@@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return dequantized values in floating point
*/
-#define DEQUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
- { \
- return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
}
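/* Editor's sketch (not part of this patch): a numeric round trip through the
 * two macros above, instantiated for scalar uchar (scale 0.5, offset 10): */
// quantize_uchar1(2.3f, 10.0f, 0.5f) -> rte(2.3f / 0.5f + 10.0f) = rte(14.6f) = 15
// dequantize_uchar1(15, 10.0f, 0.5f) -> (15 - 10.0f) * 0.5f = 2.5f (quantization is lossy)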
/** Correctly-rounded-to-nearest division by a power-of-two.
@@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Correctly-rounded-to-nearest division by a power-of-two.
*/
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
- { \
- const VEC_DATA_TYPE(int, size) \
- zero = (VEC_DATA_TYPE(int, size))0; \
- const VEC_DATA_TYPE(int, size) \
- one = (VEC_DATA_TYPE(int, size))1; \
- VEC_DATA_TYPE(int, size) \
- mask = (one << exponent) - one; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
- return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
+ return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
}
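/* Editor's sketch (not part of this patch): the mask/threshold arithmetic above
 * rounds halves away from zero. For scalar ints (size 1): */
// asymm_rounding_divide_by_POW2_1(7, 1)  -> (7 >> 1) + 1 = 4    (3.5 rounds to 4)
// asymm_rounding_divide_by_POW2_1(-7, 1) -> (-7 >> 1) + 0 = -4  (-3.5 rounds to -4)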
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -167,27 +168,29 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
- const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
- const int k_fractional_bits = 31; \
- VEC_DATA_TYPE(int, size) \
- x = a + (1 << (k_fractional_bits - 3)); \
- VEC_DATA_TYPE(int, size) \
- x2 = ASYMM_MULT(x, x, size); \
- VEC_DATA_TYPE(int, size) \
- x3 = ASYMM_MULT(x2, x, size); \
- VEC_DATA_TYPE(int, size) \
- x4 = ASYMM_MULT(x2, x2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
- return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
}
/** Each bit of the result is set to the corresponding bit of either then_val or
@@ -198,10 +201,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
 * @returns Result containing bits from @p then_val or from @p else_val, depending on whether the corresponding bit in @p if_mask is set.
*/
-#define ASYMM_SELECT_USING_MASK_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
- { \
- return (if_mask & then_val) ^ (~if_mask & else_val); \
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \
+ VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
}
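/* Editor's sketch (not part of this patch): if_mask is all-ones or all-zeros
 * per lane, so masking both operands and XOR-ing gives a branchless select: */
// asymm_select_using_mask1(~0, 7, 9) -> (~0 & 7) ^ (0 & 9)  = 7
// asymm_select_using_mask1(0, 7, 9)  -> (0 & 7)  ^ (~0 & 9) = 9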
/** For each element of the input vector, the corresponding bits of the result item are set
@@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
}
-#define EXP_BARREL_SHIFTER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
- { \
- if(k_integer_bits > exponent) \
- { \
- const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
- return ASYMM_SELECT_USING_MASK( \
- ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
- ASYMM_MULT(result, fp_multiplier, size), result, size); \
- } \
- \
- return result; \
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
}
/** Calculates \f$ exp(x) \f$ for x < 0.
@@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
- { \
- const int k_fractional_bits = 31 - k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- k_one_quarter = 1 << (k_fractional_bits - 2); \
- VEC_DATA_TYPE(int, size) \
- mask = k_one_quarter - 1; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
- VEC_DATA_TYPE(int, size) \
- remainder = a_mod_quarter_minus_one_quarter - a; \
- \
- result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
- \
- if(k_integer_bits > 5) \
- { \
- const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
- result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
- return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \
+ size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
}
/** Calculates the product of an integer value by a power of two, with either a positive exponent
@@ -297,26 +303,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Arithmetic left or right shift.
*/
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- if(exponent < 0) \
- { \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) min = INT_MIN; \
- const VEC_DATA_TYPE(int, size) max = INT_MAX; \
- int threshold = ((1 << (31 - exponent)) - 1); \
- VEC_DATA_TYPE(int, size) \
- positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
- VEC_DATA_TYPE(int, size) \
- negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
- VEC_DATA_TYPE(int, size) \
- result = x << exponent; \
- result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
- result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
- return result; \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
}
/** Calculates (a+b)/2, rounded to the nearest integer.
@@ -326,20 +333,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return (a+b)/2, rounded to the nearest integer.
*/
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(long, size) \
- a64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- sum = a64 + b64; \
- const VEC_DATA_TYPE(long, size) one = 1; \
- const VEC_DATA_TYPE(long, size) minus_one = -1; \
- VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
- return convert_int##size((sum + sign) / 2); \
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
+ return convert_int##size((sum + sign) / 2); \
}
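/* Editor's sketch (not part of this patch): widening to long avoids overflow of
 * a + b, and adding the sign before halving rounds to nearest. For size 1: */
// asymm_rounding_half_sum1(3, 4)   -> (7 + 1) / 2  = 4
// asymm_rounding_half_sum1(-3, -4) -> (-7 - 1) / 2 = -4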
/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
@@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
VEC_DATA_TYPE(int, size) \
- half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
VEC_DATA_TYPE(int, size) \
x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
- for(int i = 0; i < 3; i++) \
+ for (int i = 0; i < 3; i++) \
{ \
VEC_DATA_TYPE(int, size) \
half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
@@ -378,48 +386,57 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Rescaled value.
*/
-#define ASYMM_RESCALE_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
- { \
- int exponent = src_integer_bits - dst_integer_bits; \
- return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
}
-#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
-#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
-#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
-#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
-#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
+#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
-
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
- { \
- const int left_shift = shift > 0 ? shift : 0; \
- const int right_shift = shift > 0 ? 0 : -shift; \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
+#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
+
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
}
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
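/* Editor's sketch (not part of this patch): the single signed `shift` above is
 * split into a pre-multiply left shift (shift > 0) or a post-multiply rounding
 * right shift (shift < 0): */
// MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, +2, 1) // (input << 2) * qmul, then divide by 2^0 (identity)
// MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, -2, 1) // input * qmul, then rounding divide by 2^2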
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h
index 4ba2b2ca3a..4daf0adc89 100644
--- a/src/core/CL/cl_kernels/load_store_utility.h
+++ b/src/core/CL/cl_kernels/load_store_utility.h
@@ -223,8 +223,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
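/* Editor's sketch (not part of this patch): STORE_BLOCK dispatches on the row
 * count M0 and stores M0 row variables named BASENAME##0..BASENAME##(M0-1).
 * With hypothetical float4 accumulators c0..c3: */
// STORE_BLOCK(4, 4, float, c, dst_ptr, dst_stride_y, zout);
// resolves to STORE_ROW_4(4, float, c, dst_ptr, dst_stride_y, zout)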
/** Convert and store a block of the given size M0xN0
@@ -245,8 +247,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
/** Partially store the 0 to (n-1)th rows of the given variables
@@ -365,8 +369,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
*
* @note in cases where @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
@@ -388,22 +394,23 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
- { \
- STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
}
/** Store a block that can only be partial in x but not y.
*
@@ -425,7 +432,7 @@
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X)) \
+ if (!(PARTIAL_COND_X)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -453,7 +460,7 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
*/
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
- if(!(PARTIAL_COND_Y)) \
+ if (!(PARTIAL_COND_Y)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -517,23 +524,28 @@
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
+ STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X)
#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
@@ -560,8 +572,7 @@
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
-#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
- ((uint)(y * M0))
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
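
COMPUTE_M0_START_ROW above compensates for a partial first block in y: every work-item after the first shifts its start row back by (M0 - PARTIAL_STORE_M0) % M0 so that full blocks stay aligned. A small host-side sketch of the same expression (plain C++ for illustration; the macro itself is OpenCL C):

    #include <algorithm>
    #include <cstdio>

    // Same arithmetic as COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0).
    unsigned start_row(int y, int m0, int partial_store_m0)
    {
        return (unsigned)std::max(0, y * m0 - (m0 - partial_store_m0) % m0);
    }

    int main()
    {
        // 10 rows processed in blocks of M0 = 4 -> PARTIAL_STORE_M0 = 10 % 4 = 2.
        // y = 0 handles the 2-row partial block; later blocks shift back by 2.
        for (int y = 0; y < 3; ++y)
            printf("y=%d -> start row %u\n", y, start_row(y, 4, 2)); // 0, 2, 6
        return 0;
    }
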
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index bed94a7b3b..cb2f4b0319 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -75,7 +75,9 @@
P_X##_DEF(F, P_A, P_B, P_C); \
REPEAT_3_15(P_X, P_A, P_B, P_C)
-#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+ REPEAT_3_##P_NUM(P_OP, P_A, P_B, \
+ P_C) // One level of indirection to ensure the order of expansion does not affect preprocessing of P_NUM
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
// Repeat macros with 4 param, excluding the implicit ID param
@@ -126,52 +128,59 @@
P_X##_DEF(F, P_A, P_B, P_C, P_D); \
REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
-#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \
+ P_D) // One level of indirection to ensure the order of expansion does not affect preprocessing of P_NUM
#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
// Macro for initializing N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
// Macro for initializing N variables by converting the data type. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for initializing N variables by converting the data type with saturation. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
-#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for adding a constant to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
-#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
// Macro for adding a vector to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
-#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
// Macro for adding two sets of N variables. Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
-#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
// Macro for performing Max between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
// Macro for performing Min between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
// Macro for applying ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for applying ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for applying per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
@@ -182,6 +191,7 @@
VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \
})
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
#endif // ARM_COMPUTE_REPEAT_H
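
The REPEAT_*_N family unrolls a statement N times by token-pasting the count onto a chain of numbered macros; the REPEAT_DEF_*_N level exists so that a count which is itself a macro is expanded before the paste. A cut-down, self-contained C++ reconstruction of the pattern (N up to 3 instead of 16, decimal instead of hexadecimal IDs; names are illustrative):

    #include <cstdio>

    // *_DEF takes an ID and emits one statement; REPEAT_3_k emits ID k-1
    // and recurses downwards, so N statements come out in total.
    #define INIT_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
    #define REPEAT_3_1(P_X, A, B, C) P_X##_DEF(0, A, B, C);
    #define REPEAT_3_2(P_X, A, B, C) \
        P_X##_DEF(1, A, B, C);       \
        REPEAT_3_1(P_X, A, B, C)
    #define REPEAT_3_3(P_X, A, B, C) \
        P_X##_DEF(2, A, B, C);       \
        REPEAT_3_2(P_X, A, B, C)
    // One level of indirection so a macro argument (UNROLL below) is
    // expanded before being pasted onto REPEAT_3_.
    #define REPEAT_DEF_3_N(P_NUM, P_OP, A, B, C) REPEAT_3_##P_NUM(P_OP, A, B, C)
    #define REPEAT_3_N(P_NUM, P_OP, A, B, C) REPEAT_DEF_3_N(P_NUM, P_OP, A, B, C)

    #define UNROLL 3

    int main()
    {
        REPEAT_3_N(UNROLL, INIT, int, acc, 7) // int acc2 = 7; int acc1 = 7; int acc0 = 7;
        printf("%d %d %d\n", acc0, acc1, acc2); // 7 7 7
        return 0;
    }

Calling REPEAT_DEF_3_N(UNROLL, ...) directly would paste the token REPEAT_3_UNROLL and fail; routing through REPEAT_3_N lets UNROLL expand to 3 first.
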
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 642483ab3c..6595bd1981 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -31,11 +31,13 @@
* @param[in] border_size Border size of the image
*
*/
-inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+inline const float8
+clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
{
const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
- return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3,
+ clamped_y.s3);
}
/** Clamps the given coordinates to the borders.
@@ -74,7 +76,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int
*/
inline const float8 get_neighbour_coords(const float2 coord)
{
- return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1,
+ /*br*/ coord.s0 + 1, coord.s1 + 1);
}
/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
@@ -85,37 +88,38 @@ inline const float8 get_neighbour_coords(const float2 coord)
* @param[in] height Height of the image
* @param[in] border_size Border size
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(
+ const Image *in, const float8 coords, const float width, const float height, const float border_size)
{
// If any of the 4 texels is out of the image's boundaries, the border value (REPLICATE or CONSTANT) is used for that texel.
// Sets the 4x4 coordinates for each of the four input texels
const float8 fc = floor(coords);
- const float16 c1 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
- const float16 c2 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+ const float16 c1 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+ const float16 c2 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
// Loads the values from the input image
const float16 t = (float16)(
- /* tl, tr, bl, br */
- * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
- *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
- *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
- *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
- *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
- *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
- *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
- *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
- const float8 a = coords - fc;
- const float8 b = ((float8)(1.f)) - a;
- const float4 fr = (float4)(
- ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
- ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
- ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
- ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+ /* tl, tr, bl, br */
+ *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+ const float8 a = coords - fc;
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr =
+ (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+ ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+ ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+ ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
}
@@ -126,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const
* @param[in] width Width of the image
* @param[in] height Height of the image
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4)
+ bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
{
return bilinear_interpolate_with_border(in, coords, width, height, 1);
}
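
The reflowed arithmetic in bilinear_interpolate_with_border is the standard bilinear blend: a holds the fractional parts of the sampling coordinates, b = 1 - a, and each result is tl*bx*by + tr*ax*by + bl*bx*ay + br*ax*ay. A scalar C++ sketch of one such blend (illustrative only; the kernel evaluates four samples at once on float8/float16 vectors):

    #include <cmath>
    #include <cstdio>

    // One bilinear sample from four texel values (tl, tr, bl, br).
    float bilinear(float tl, float tr, float bl, float br, float x, float y)
    {
        const float ax = x - std::floor(x), ay = y - std::floor(y);
        const float bx = 1.f - ax, by = 1.f - ay;
        return tl * bx * by + tr * ax * by + bl * bx * ay + br * ax * ay;
    }

    int main()
    {
        // Sampling at the exact centre of four texels averages them.
        printf("%.2f\n", bilinear(0.f, 1.f, 2.f, 3.f, 10.5f, 20.5f)); // 1.50
        return 0;
    }
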
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 2728958add..5b72354abe 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,16 +45,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64,
+ DataType::U64);
}
return Status{};
@@ -66,22 +71,34 @@ CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape{ input->info()->tensor_shape() };
+ TensorShape output_shape{input->info()->tensor_shape()};
output_shape.set(axis, 1);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(DataType::S32)
+ .reset_padding()
+ .set_is_resizable(true));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -90,11 +107,14 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
// Set build options
const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0));
- const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) ? 2U : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
+ const auto vector_size = (adjusted_vector_size == 3U && axis == 0U)
+ ? 2U
+ : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % vector_size));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size));
build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
@@ -104,7 +124,7 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
// Create kernel
std::string kernel_axis_name;
- switch(axis)
+ switch (axis)
{
case 0:
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
@@ -135,7 +155,10 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
@@ -146,7 +169,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
@@ -154,7 +177,8 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
Window out_window(window);
Window in_window(window);
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
- in_window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ in_window.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u));
// Get first input and output slices
@@ -166,15 +190,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+ } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
break;
case 1:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window window_in{window};
+ window_in.set(Window::DimY,
+ Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
Window in_slice = window_in.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
@@ -184,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
break;
case 2:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window window_in{window};
+ window_in.set(Window::DimZ,
+ Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
Window in_slice = window_in.first_slice_window_3D();
Window out_slice = window.first_slice_window_3D();
@@ -202,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
}
break;
case 3:
{
// Get first input and output slices
- Window window_in{ window };
+ Window window_in{window};
window_in.set(3, Window::Dimension(0, 1, 1));
Window in_slice = window_in.first_slice_window_4D();
Window out_slice = window.first_slice_window_4D();
@@ -220,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, in_slice);
add_4D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
}
break;
default:
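
configure() above derives the kernel's vector width from the innermost dimension: adjust_vec_size(16U, dim0) proposes a width, a result of 3 is remapped to 2 on axis 0 because the OpenCL kernel only supports widths 2, 4, 8 and 16, and dim0 % VEC_SIZE becomes the VEC_SIZE_LEFTOVER build option for the tail elements. A rough host-side sketch; the clamping behaviour assumed for adjust_vec_size() here is a guess, and only the 3 -> 2 remap and the leftover computation come from the source above:

    #include <cstdio>

    unsigned pick_vec_size(unsigned dim0, unsigned axis)
    {
        // Assumption: adjust_vec_size(16, dim0) clamps the preferred width
        // of 16 down to dim0 for small tensors.
        unsigned vec = dim0 < 16 ? dim0 : 16;
        // From the source: width 3 is remapped to 2 on axis 0, since the
        // OpenCL kernel only supports widths 2, 4, 8 and 16.
        if (vec == 3 && axis == 0)
            vec = 2;
        return vec;
    }

    int main()
    {
        const unsigned dim0 = 20, axis = 0;
        const unsigned vec  = pick_vec_size(dim0, axis);
        // The build options would carry -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=4 here.
        printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u\n", vec, dim0 % vec);
        return 0;
    }
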
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
index 5f36bdf113..fb3b41b0de 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -72,7 +73,11 @@ public:
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
*
@@ -84,7 +89,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 3fa8a8edaa..c88a852a44 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -23,58 +23,64 @@
*/
#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
- if(beta != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
}
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -86,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->element_size(), input->dimension(0));
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
bool window_changed = false;
- if(output != nullptr)
+ if (output != nullptr)
{
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
window_changed = update_window_and_padding(win, input_access, output_access);
@@ -104,30 +111,50 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
window_changed = update_window_and_padding(win, input_access);
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
+ : _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _beta(nullptr),
+ _gamma(nullptr),
+ _epsilon(0),
+ _run_in_place(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
- auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma });
+ auto padding_info = get_padding_info({input, output, mean, var, beta, gamma});
_input = input;
_output = output;
_mean = mean;
@@ -142,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
(gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -157,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
// Create kernel
- _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel =
+ create_kernel(compile_context,
+ "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Set kernel static arguments
unsigned int include_output = (!_run_in_place) ? 1 : 0;
- unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- if(_beta != nullptr)
+ unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() +
+ 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ if (_beta != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip beta parameter
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
}
_kernel.setArg<cl_float>(idx++, _epsilon);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
// Configure kernel window
- if(input->info()->data_layout() == DataLayout::NHWC)
+ if (input->info()->data_layout() == DataLayout::NHWC)
{
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
@@ -205,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
_config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
}
-Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- if(input->data_layout() != DataLayout::NHWC)
+ if (input->data_layout() != DataLayout::NHWC)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+ .first);
}
return Status{};
@@ -236,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor();
add_1D_tensor_argument(idx, _mean, vector_slice);
add_1D_tensor_argument(idx, _var, vector_slice);
- if(_beta != nullptr)
+ if (_beta != nullptr)
{
add_1D_tensor_argument(idx, _beta, vector_slice);
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
add_1D_tensor_argument(idx, _gamma, vector_slice);
}
@@ -249,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
{
idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
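
Per element, this kernel computes the usual batch-normalization transform, with USE_DEFAULT_BETA and USE_DEFAULT_GAMMA (build options above) standing in for beta = 0 and gamma = 1, and the optional fused activation applied afterwards. A scalar sketch, assuming the textbook formula:

    #include <cmath>
    #include <cstdio>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta
    float batch_norm(float x, float mean, float var, float beta, float gamma,
                     float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }

    int main()
    {
        // One channel: mean 2, variance 4, default beta/gamma, epsilon 0.001f
        // (the default used by the configure() overloads of this kernel).
        printf("%.3f\n", batch_norm(4.f, 2.f, 4.f, 0.f, 1.f, 0.001f)); // ~1.000
        return 0;
    }
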
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index acbe0f2a26..1a88d2a8c5 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -64,7 +65,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
@@ -82,8 +89,15 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -99,10 +113,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index 143a842d02..c640b5a8d6 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -25,13 +25,14 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
-#include "arm_compute/core/TensorInfo.h"
using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
@@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -66,10 +71,11 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
- const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -79,8 +85,7 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
}
} // namespace
-CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, block_shape, output });
+ auto padding_info = get_padding_info({input, block_shape, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
@@ -106,8 +114,9 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
- _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
-
+ _kernel = create_kernel(compile_context,
+ "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -116,47 +125,65 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output,
- const CropInfo &crop_info)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left));
build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top));
- _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(
+ compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
@@ -185,7 +212,7 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, slice_in);
add_argument(idx, batch_id);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
add_1D_tensor_argument(idx, _block_shape, vector_slice);
}
@@ -193,7 +220,6 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
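
validate_arguments_static() above requires the batch dimension to divide evenly by block_shape_x * block_shape_y and checks the output against compute_batch_to_space_shape(). A rough sketch of that shape relation, assuming NHWC ordering; the left/right and top/bottom crops are lumped into single widths here for brevity:

    #include <cassert>
    #include <cstdio>

    struct Shape
    {
        unsigned n, h, w, c;
    };

    Shape batch_to_space_shape(Shape in, unsigned bx, unsigned by,
                               unsigned crop_w, unsigned crop_h)
    {
        assert(in.n % (bx * by) == 0); // mirrors the check in validate_arguments_static()
        return {in.n / (bx * by), in.h * by - crop_h, in.w * bx - crop_w, in.c};
    }

    int main()
    {
        // 8 batches of 4x4x3 with a 2x2 block and no cropping -> 2 batches of 8x8x3.
        const Shape out = batch_to_space_shape({8, 4, 4, 3}, 2, 2, 0, 0);
        printf("%ux%ux%ux%u\n", out.n, out.h, out.w, out.c); // 2x8x8x3
        return 0;
    }
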
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
index a05184cd5b..b9d3e66fe2 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -65,7 +66,10 @@ public:
*
* @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -74,7 +78,11 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info);
+ void configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] compile_context The compile context to be used.
@@ -84,7 +92,12 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -106,7 +119,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info);
+ static Status validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
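For readers following the reformatted overloads above, a minimal host-side sketch of the static block-shape path. It is not part of this change set; kernels under src/core are internal (applications normally go through the runtime CLBatchToSpaceLayer function), and the shapes and the empty CropInfo below are illustrative assumptions.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Assumed NCHW input: 2x2 spatial, 1 channel, batch 4 == block_x * block_y.
    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));

    CLBatchToSpaceLayerKernel kernel;
    // Static block-shape overload: the output info is auto-initialised from the
    // computed shape; an empty CropInfo requests no cropping.
    kernel.configure(&input, 2, 2, &output, CropInfo{});

    input.allocator()->allocate();
    output.allocator()->allocate();
    CLScheduler::get().enqueue(kernel);
    CLScheduler::get().sync();
    return 0;
}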
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
index 11e6d021a5..de3fb43de8 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -28,25 +28,29 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLBitwiseKernel::CLBitwiseKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op)
+void CLBitwiseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- if(op != BitwiseOperation::NOT)
+ if (op != BitwiseOperation::NOT)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
@@ -56,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
// Output auto initialization if not yet initialized
auto_init_if_empty(*(output->info()), *(input1->info()));
- auto padding_info = get_padding_info({ input1, input2, output });
+ auto padding_info = get_padding_info({input1, input2, output});
// Configure kernel window
const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0));
@@ -68,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
// Create kernel
std::string kernel_name = "";
- switch(op)
+ switch (op)
{
case BitwiseOperation::AND:
kernel_name = "bitwise_and";
@@ -107,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input1, slice);
- if(_input2 != nullptr)
+ if (_input2 != nullptr)
{
add_2D_tensor_argument(idx, _input2, slice);
}
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h
index c5a999643d..2c74955ae4 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.h
+++ b/src/core/CL/kernels/CLBitwiseKernel.h
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
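A minimal sketch of driving the five-argument configure() shown above, assuming U8 tensors of matching shape (the kernel validates U8 inputs, and input2 may be nullptr only for BitwiseOperation::NOT). Not part of this commit; shapes are illustrative.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLBitwiseKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor in1, in2, out; // all U8; out is auto-initialised from in1
    const TensorInfo info(TensorShape(16U, 16U), 1, DataType::U8);
    in1.allocator()->init(info);
    in2.allocator()->init(info);

    CLBitwiseKernel kernel;
    // For BitwiseOperation::NOT, in2 could be nullptr (see the run() guard above).
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &in1, &in2, &out, BitwiseOperation::AND);

    in1.allocator()->allocate();
    in2.allocator()->allocate();
    out.allocator()->allocate();
    CLScheduler::get().enqueue(kernel);
    return 0;
}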
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 72de854afb..f32c518e29 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -40,7 +41,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
@@ -53,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
@@ -65,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
@@ -83,22 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
}
} // namespace
-CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
- : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
- auto padding_info = get_padding_info({ boxes, pred_boxes, deltas });
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto padding_info = get_padding_info({boxes, pred_boxes, deltas});
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
@@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
- if(is_quantized)
+ if (is_quantized)
{
build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform();
@@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
// Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
const unsigned int num_elems_processed_per_iteration = 4;
- Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
index 08f350e86a..9a1bb49bb9 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -58,7 +58,10 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -71,7 +74,11 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -85,7 +92,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
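A hedged usage sketch of the wrapped signatures above. The shapes follow the (4, N) boxes / (4 * num_classes, N) deltas convention checked by validate_arguments(); the BoundingBoxTransformInfo arguments (image width, height, scale) are illustrative assumptions, not values from this commit.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor boxes, deltas, pred_boxes;
    boxes.allocator()->init(TensorInfo(TensorShape(4U, 128U), 1, DataType::F32));  // 128 proposals
    deltas.allocator()->init(TensorInfo(TensorShape(4U, 128U), 1, DataType::F32)); // single class

    CLBoundingBoxTransformKernel kernel;
    // pred_boxes is auto-initialised from deltas with the boxes' data type;
    // the (800, 600, 1.0) image geometry is a placeholder.
    kernel.configure(&boxes, &pred_boxes, &deltas, BoundingBoxTransformInfo(800.f, 600.f, 1.f));

    boxes.allocator()->allocate();
    deltas.allocator()->allocate();
    pred_boxes.allocator()->allocate();
    CLScheduler::get().enqueue(kernel);
    return 0;
}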
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index a2a0bc4fb4..ec58bf9e7a 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
// There cannot be more groups than channels
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -70,11 +75,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
auto_init_if_empty(*output, *input->clone());
const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- if(is_nhwc)
+ if (is_nhwc)
{
- unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
- Window win_collapsed = win.collapse(win, Window::DimZ);
+ unsigned int num_elems_processed_per_iteration_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
+ Window win_collapsed = win.collapse(win, Window::DimZ);
return std::make_pair(Status{}, win_collapsed);
}
else
@@ -83,22 +89,25 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
constexpr unsigned int num_elems_processed_per_iteration_y = 2;
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ Window win = calculate_max_window(
+ *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
const bool window_changed = update_window_and_padding(win, input_access, output_access);
Window win_collapsed = win.collapse(win, Window::DimZ);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win_collapsed);
}
}
} // namespace
-CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -108,23 +117,27 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- const bool is_nhwc = data_layout == DataLayout::NHWC;
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- unsigned int vec_size_x = 0;
- unsigned int vec_size_x_leftovers = 0;
- if(is_nhwc)
+ const DataLayout data_layout = input->info()->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const unsigned int channels =
+ input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ unsigned int vec_size_x = 0;
+ unsigned int vec_size_x_leftovers = 0;
+ if (is_nhwc)
{
- vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
}
else
@@ -170,13 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont
_config_id += support::cpp11::to_string(output->info()->dimension(1));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(2));
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
}
-Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 31c007f17e..43c939ebd8 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -60,7 +60,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
* @param[in] input Input tensor info. Data types supported: All.
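A minimal sketch of the configure() contract enforced by validate_arguments() above: num_groups must be at least 2, strictly fewer than the channel count, and divide it exactly. Shapes are illustrative assumptions; this is not part of the commit.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Assumed NCHW input with 4 channels; 2 groups satisfies
    // 2 <= num_groups < channels and channels % num_groups == 0.
    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));

    CLChannelShuffleLayerKernel kernel;
    kernel.configure(&input, &output, 2); // output is auto-initialised from input

    input.allocator()->allocate();
    output.allocator()->allocate();
    CLScheduler::get().enqueue(kernel);
    return 0;
}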
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index f4d6316517..f27270733e 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,14 +39,10 @@ namespace arm_compute
namespace
{
// Create supported comparisons map
-const std::map<ComparisonOperation, std::string> supported_comparison_ops =
-{
- { ComparisonOperation::Equal, "EQUAL" },
- { ComparisonOperation::NotEqual, "NOTEQUAL" },
- { ComparisonOperation::Greater, "GREATER" },
- { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
- { ComparisonOperation::Less, "LESS" },
- { ComparisonOperation::LessEqual, "LESSEQUAL" },
+const std::map<ComparisonOperation, std::string> supported_comparison_ops = {
+ {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"},
+ {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"},
+ {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"},
};
int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
@@ -53,7 +50,10 @@ int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
return 16 / input.element_size();
}
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+Status validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN);
@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured output
- if(output.total_size() > 0)
+ if (output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
// Auto initialize output if not initialized
@@ -90,27 +90,34 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe
AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
-CLComparisonKernel::CLComparisonKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
@@ -129,10 +136,11 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+ build_opts.emplace("-DVEC_SIZE=" +
+ support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
build_opts.emplace("-DOP=" + operation_name);
build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
- if(is_data_type_quantized(input1->info()->data_type()))
+ if (is_data_type_quantized(input1->info()->data_type()))
{
const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
@@ -160,12 +168,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
_config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
}
-Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparisonKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
@@ -181,17 +193,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
bool can_collapse = true;
const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -212,16 +225,16 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
BorderSize CLComparisonKernel::border_size() const
{
const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize{0, border, 0, 0};
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
index 0b94190183..174a6c9bf9 100644
--- a/src/core/CL/kernels/CLComparisonKernel.h
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -64,7 +65,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] operation Comparison operation to use.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
*
* @param[in] input1 Source tensor. Data types supported: All.
@@ -74,10 +79,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
// Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run(const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;
private:
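A usage sketch matching the reformatted supported_comparison_ops map and the validate() overload above; the U8 output is auto-initialised to the broadcast shape. Shapes here are assumptions for illustration only.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLComparisonKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor lhs, rhs, mask;
    lhs.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    CLComparisonKernel kernel;
    // Any key of supported_comparison_ops works here, e.g. Greater -> "GREATER".
    kernel.configure(&lhs, &rhs, &mask, ComparisonOperation::Greater);

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    mask.allocator()->allocate(); // U8 mask, auto-initialised by configure()
    CLScheduler::get().enqueue(kernel);
    return 0;
}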
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 76af5d564a..f8ecc4c098 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -40,7 +41,8 @@ CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
_type = CLKernelType::ELEMENTWISE;
}
-Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
const PadStrideInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -60,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
@@ -68,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
return Status{};
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -119,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
@@ -137,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+ } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
break;
}
case DataLayout::NHWC:
@@ -156,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
break;
}
default:
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index e0d1322341..762989a836 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -62,7 +62,10 @@ public:
* @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
*
* @param[in] input Source tensor info. Data types supported: All.
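Because the output shape passed to this kernel must already describe the upsampled tensor, the static validate() reformatted above is typically queried first. A hedged sketch of that pattern, with illustrative shapes that are assumed, not taken from this commit:

#include <iostream>

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"

using namespace arm_compute;

int main()
{
    // Illustrative shapes: the channel counts must match and the output
    // spatial dimensions must be non-zero, per the checks shown above.
    const TensorInfo input(TensorShape(4U, 4U, 3U), 1, DataType::F32);
    const TensorInfo output(TensorShape(8U, 8U, 3U), 1, DataType::F32);

    const Status status = CLDeconvolutionLayerUpsampleKernel::validate(&input, &output, PadStrideInfo(2, 2, 0, 0));
    if (status.error_code() != ErrorCode::OK)
    {
        std::cout << "Unsupported: " << status.error_description() << std::endl;
    }
    return 0;
}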
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index 0fc0ff8168..b33e0a8b6f 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -38,7 +39,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
@@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32);
- if(!is_qasymm)
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S32);
+ if (!is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) *
+ weights_info->dimension(idx_b));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_qasymm)
+ if (is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+ stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -97,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims =
+ deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_layout(data_layout)
+ .set_quantization_info(input->quantization_info()));
Window win = calculate_max_window(*input);
@@ -109,29 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
- : _add_bias(false),
- _bias(nullptr)
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info);
}
-void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info,
- const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info)
+void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr),
+ output->info(), input_info, weights_info, deconv_info));
- auto padding_info = get_padding_info({ input, bias, output });
+ auto padding_info = get_padding_info({input, bias, output});
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+ auto win_config =
+ validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
const DataLayout data_layout = input_info->data_layout();
@@ -178,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
@@ -194,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, collapsed);
add_3D_tensor_argument(idx, _output, collapsed);
- if(_add_bias)
+ if (_add_bias)
{
add_1D_tensor_argument(idx, _bias, collapsed);
}
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
index ce354fa86f..8f436b07e3 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -67,7 +67,12 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Initialise the kernel's source and destination.
*
* @param[in] compile_context The compile context to be used.
@@ -79,8 +84,13 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel.
*
@@ -93,7 +103,12 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
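Given how many parameters are threaded through this kernel, a hedged sketch of the static validate() shown above may help. The relationship below (GEMM output dimension 0 == kernel_w * kernel_h * num_kernels, stride equal to the kernel size) mirrors the checks in validate_arguments(); every shape is an illustrative assumption.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"

using namespace arm_compute;

int main()
{
    // Illustrative NCHW setup: 2x2 kernels, stride (2, 2), 8 kernels, 4x4x3 conv input.
    const TensorInfo input_info(TensorShape(4U, 4U, 3U), 1, DataType::F32);       // original conv input
    const TensorInfo weights_info(TensorShape(2U, 2U, 3U, 8U), 1, DataType::F32); // w x h x IFM x num_kernels
    const TensorInfo gemm_out(TensorShape(32U, 4U, 4U), 1, DataType::F32);        // dim 0 == 2 * 2 * 8
    TensorInfo output; // left empty: the shape is auto-deduced during validation

    const Status status = CLDeconvolutionReshapeOutputKernel::validate(
        &gemm_out, nullptr /* bias */, &output, &input_info, &weights_info, PadStrideInfo(2, 2, 0, 0));
    return status.error_code() == ErrorCode::OK ? 0 : 1;
}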
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index 5c1dc4fbf6..cdf19ab2e1 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -63,8 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -74,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
@@ -98,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
@@ -137,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_in, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_in));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
index 1f7f77b569..cef70c4dda 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
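A minimal sketch of the single-line configure() overload above, under the constraint checked earlier in validate_arguments() that the channel count is divisible by block_shape squared. Shapes are illustrative assumptions; this is not part of the commit.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Assumed NCHW input: 4 channels with block_shape 2, since 4 % (2 * 2) == 0.
    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 4U), 1, DataType::F32));

    CLDepthToSpaceLayerKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &input, &output, 2);

    input.allocator()->allocate();
    output.allocator()->allocate(); // auto-initialised to the depth-to-space shape
    CLScheduler::get().enqueue(kernel);
    return 0;
}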
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index e34b6929e7..b95abe795f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -23,16 +23,17 @@
*/
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/ICLKernel.h"
@@ -45,12 +46,18 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_UNUSED(dwc_info);
bool in_place = false;
- if(output == nullptr || output == input)
+ if (output == nullptr || output == input)
{
in_place = true;
output = input;
@@ -58,11 +65,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) &&
+ (export_to_cl_image(weights) == false),
+ "Weights cannot be exported to cl_image!");
ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -72,33 +82,40 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier));
// In place restrictions
- if(in_place)
+ if (in_place)
{
- const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U);
+ const int weights_width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const int weights_height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U ||
+ weights->tensor_shape()[weights_height_idx] != 1U);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U));
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U));
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ conv_info.pad_stride_info
+             .has_padding()); // Note that in principle padding can be supported with in_place but we choose not to support it
}
- const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+ const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(),
+ conv_info.dilation};
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
- if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+ if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
}
const bool is_quantized = is_data_type_quantized(input->data_type());
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -108,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
}
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
@@ -116,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
@@ -134,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- if(is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized(input->data_type()))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+ const UniformQuantizationInfo oq_info =
+ (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
}
return Status{};
@@ -171,30 +190,48 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
_type = CLKernelType::DEPTHWISE;
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info,
+ output_multipliers, output_shifts);
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- if(output == nullptr)
+ if (output == nullptr)
{
// In-place
output = input;
}
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- dwc_info, conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info,
+ conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
+ (output_shifts != nullptr) ? output_shifts->info() : nullptr));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info);
- auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *(input->info()), *(weights->info()), conv_info);
+ auto_init_if_empty(*(output->info()), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_quantization_info(output->info()->quantization_info()));
_input = input;
_output = output;
@@ -214,12 +251,12 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
CLBuildOptions build_opts;
// Update the padding for the input/weights tensor if we can export to cl_image
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
}
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
}
@@ -229,9 +266,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
const auto act_function = conv_info.act_info.activation();
const auto dst_data_type = _output->info()->data_type();
- if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
{
// -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
// to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -268,23 +306,24 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
- build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0));
+ build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1",
+ "-DN0_A=" + support::cpp11::to_string(n0));
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
    // Force unroll with pragma when any of the following values exceeds the maximum number of manual unrolls
- set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
- static_cast<int>(_weights->info()->dimension(1)),
- static_cast<int>(_weights->info()->dimension(2))
- });
+ set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+ static_cast<int>(_weights->info()->dimension(1)),
+ static_cast<int>(_weights->info()->dimension(2))});
- if(biases != nullptr)
+ if (biases != nullptr)
{
build_opts.add_option(std::string("-DHAS_BIAS"));
- build_opts.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
+ build_opts.add_option(
+ std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
}
- if(_is_quantized)
+ if (_is_quantized)
{
kernel_name = "dwc_native_quantized_nhwc";
const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
@@ -306,13 +345,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
- build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
- build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + get_cl_type_from_data_type(_output_shifts->info()->data_type()));
- build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
+ build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
+ build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_shifts->info()->data_type()));
+ build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL,
+ "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
// Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
int a_val{};
int b_val{};
- std::tie(b_val, a_val) = get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
+ std::tie(b_val, a_val) =
+ get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val));
build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val));
@@ -321,8 +364,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
{
kernel_name = "dwc_native_fp_nhwc";
build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
- build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
}
Window win = calculate_max_window(*(output->info()), Steps(n0, m0));
@@ -350,10 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
_config_id += string_from_data_type(input->info()->data_type());
}
-Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
return Status{};
}
@@ -370,47 +422,52 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
cl::Image2D input_cl_image;
cl::Image2D weights_cl_image;
- if(_export_input_to_cl_image || _export_weights_to_cl_image)
+ if (_export_input_to_cl_image || _export_weights_to_cl_image)
{
// Export cl_buffer to cl_image
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
- const size_t image_w = _input->info()->dimension(0) / 4;
- const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+ const size_t image_w = _input->info()->dimension(0) / 4;
+ const size_t image_h =
+ _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
const TensorShape shape2d(image_w, image_h);
const size_t image_row_pitch = _input->info()->strides_in_bytes()[1];
- input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ input_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d,
+ _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
- const size_t image_w = _weights->info()->dimension(0) / 4;
- const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+ const size_t image_w = _weights->info()->dimension(0) / 4;
+ const size_t image_h =
+ _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
const TensorShape shape2d(image_w, image_h);
const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1];
- weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch,
- CLImage2DType::ReadOnly);
+ weights_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d,
+ _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
}
unsigned int idx = 0;
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
_kernel.setArg(idx++, input_cl_image);
}
add_4d_tensor_nhwc_argument(idx, _input);
add_4d_tensor_nhwc_argument(idx, _output);
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
}
add_4d_tensor_nhwc_argument(idx, _weights);
- if(_is_quantized)
+ if (_is_quantized)
{
add_1D_tensor_argument(idx, _output_multipliers, slice);
add_1D_tensor_argument(idx, _output_shifts, slice);
}
- if(_biases != nullptr)
+ if (_biases != nullptr)
{
add_1D_tensor_argument(idx, _biases, slice);
}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 8eee7b2500..d34a662966 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
class ICLTensor;
@@ -74,15 +74,28 @@ public:
* * no padding
* * no change of data layout after configure
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
* Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
@@ -90,23 +103,29 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input {};
+ const ICLTensor *_input{};
const ICLTensor *_weights{};
const ICLTensor *_biases{};
ICLTensor *_output{};
- unsigned int _depth_multiplier{ 0 };
+ unsigned int _depth_multiplier{0};
const ICLTensor *_output_multipliers{};
const ICLTensor *_output_shifts{};
- bool _export_input_to_cl_image{ false };
- bool _export_weights_to_cl_image{ true };
- bool _is_quantized{ false };
+ bool _export_input_to_cl_image{false};
+ bool _export_weights_to_cl_image{true};
+ bool _is_quantized{false};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
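
The hunks above repeatedly apply one rewrapping rule: a declaration that exceeds the column limit gets one parameter per line, with parameter names aligned. A minimal sketch of the effect follows; the kernel name is hypothetical, and the option names are assumptions about clang-format 14 spellings with this effect (the configuration file itself is not reproduced in this diff):

#include <cstdint>
class CLCompileContext; // forward declarations so the sketch is self-contained
class ICLTensor;

class CLHypotheticalKernel // hypothetical name, for illustration only
{
public:
    // Before formatting (over an assumed ColumnLimit of 120), all on one line:
    //   void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t block_shape);
    // After, with the assumed options BinPackParameters: false and
    // AlignAfterOpenBracket: Align — one parameter per line, names aligned:
    void configure(const CLCompileContext &compile_context,
                   ICLTensor              *input,
                   ICLTensor              *output,
                   int32_t                 block_shape);
};
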
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 9b514ed705..3d8f875ef7 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,17 +38,20 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -57,7 +61,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
@@ -69,21 +76,27 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
- : _input(nullptr), _output(nullptr), _idx(nullptr)
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config);
}
-void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
- auto padding_info = get_padding_info({ input, output, idx });
+ auto padding_info = get_padding_info({input, output, idx});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
_input = input;
@@ -114,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -137,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _output, slice);
add_1D_tensor_argument(idx, _idx, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
index e5583a4c22..fdd1bcc3d3 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -56,7 +56,8 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void
+ configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -65,7 +66,11 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
@@ -75,7 +80,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 95f4b640bd..3729e6b77d 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -59,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -76,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-CLFFTRadixStageKernel::CLFFTRadixStageKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -87,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
- auto padding_info = get_padding_info({ input, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -110,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments if not the first stage
- if(!config.is_first_stage)
+ if (!config.is_first_stage)
{
const unsigned int Ni = config.Nx * config.radix;
const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, config.Nx);
_kernel.setArg<cl_uint>(idx++, Ni);
_kernel.setArg<cl_float>(idx, exp_const);
@@ -136,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -165,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
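
The do-while rewrites in the kernels above follow two rules applied together: a space after control-flow keywords and the closing brace joined with its while onto a single `} while (...)` line. A small self-contained sketch, assuming an option along the lines of SpaceBeforeParens: ControlStatements (the option name is an assumption):

#include <cstdio>

int main()
{
    int i = 0;
    do
    {
        std::printf("slice %d\n", i); // stand-in for enqueueing one window slice
    } while (++i < 3);                // before: "}" on its own line, then "while(++i < 3);"

    if (i == 3) // before: "if(i == 3)"
    {
        return 0;
    }
    return 1;
}
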
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
index 9bb310db83..de80bfced3 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.h
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
#include <set>
namespace arm_compute
@@ -69,7 +69,10 @@ public:
* @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
* @param[in] config FFT descriptor metadata.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index 8a714d71bf..be6e16b074 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -54,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
}
} // namespace
-CLFFTScaleKernel::CLFFTScaleKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -65,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+void CLFFTScaleKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -78,20 +81,22 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels()
+ : input->info()->num_channels()));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(config.conjugate, "-DCONJ");
std::string kernel_name = "fft_scale_conj";
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_float>(idx, config.scale);
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
- if(output != nullptr)
+ if (output != nullptr)
{
        // Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
@@ -130,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
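
The `{ ... }` changes above (member initializers, `get_padding_info({...})` argument lists, `std::set` literals) all stem from one spacing rule: no padding inside braced init lists, presumably Cpp11BracedListStyle: true (the option name is an assumption). A compilable sketch:

#include <set>

struct ExampleKernel // hypothetical type, for illustration
{
    bool _run_in_place{false}; // before: _run_in_place{ false }
};

std::set<unsigned int> supported_radix()
{
    return std::set<unsigned int>{2, 3, 4, 5, 7, 8}; // before: { 2, 3, 4, 5, 7, 8 }
}

int main()
{
    ExampleKernel k;
    return supported_radix().count(4) && !k._run_in_place ? 0 : 1;
}
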
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
index cc518be193..b995282e02 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.h
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -63,7 +63,10 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] config Kernel configuration
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index fcd99a4ed9..86bb502da3 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -31,14 +31,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLFillBorderKernel::CLFillBorderKernel()
- : ICLKernel(), _tensor(nullptr)
+CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -56,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue
ICLKernel::add_argument<T>(idx, static_cast<T>(value));
}
-void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
_tensor = tensor;
configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
- auto padding_info = get_padding_info({ tensor });
+ auto padding_info = get_padding_info({tensor});
border_size.limit(tensor->padding());
// If there is no border: early exit
- if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+ if (border_size.empty() || border_mode == BorderMode::UNDEFINED)
{
return;
}
@@ -98,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Create static kernel arguments
- const unsigned int valid_width = tensor->valid_region().shape[0];
- const unsigned int valid_height = tensor->valid_region().shape[1];
- const cl_int2 valid_region_coords =
- {
- {
- static_cast<cl_int>(tensor->valid_region().anchor[0]),
- static_cast<cl_int>(tensor->valid_region().anchor[1]),
- }
- };
- const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+ const unsigned int valid_width = tensor->valid_region().shape[0];
+ const unsigned int valid_height = tensor->valid_region().shape[1];
+ const cl_int2 valid_region_coords = {{
+ static_cast<cl_int>(tensor->valid_region().anchor[0]),
+ static_cast<cl_int>(tensor->valid_region().anchor[1]),
+ }};
+ const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
// Set static kernel arguments
    unsigned int idx = num_arguments_per_3D_tensor(); // Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
- if(BorderMode::CONSTANT == border_mode)
+ if (BorderMode::CONSTANT == border_mode)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -175,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
- const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto tensor =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
@@ -193,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::
unsigned int idx = 0;
add_3D_tensor_argument(idx, tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
@@ -216,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
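
The header shuffles throughout this section (for example, src/core/CL/ICLKernel.h moving below the arm_compute/ includes, with a blank line inserted between the groups) follow one ordering: public arm_compute/ headers first, sorted alphabetically (apparently case-insensitively), then private src/ headers as a separate block. A fragment-only sketch — these are real repository headers, so this is not a standalone program, and the grouping mechanism (something like SortIncludes plus IncludeCategories) is an assumption:

// Before: the private header sat above the public ones
//   #include "src/core/CL/ICLKernel.h"
//
//   #include "arm_compute/core/KernelDescriptors.h"
// After: public arm_compute/ headers, a blank line, then private src/ headers
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/PixelValue.h"

#include "src/core/CL/ICLKernel.h"
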
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
index 7951f48171..5782143cf9 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.h
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -57,7 +58,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
* @param[in,out] tensor Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
@@ -65,7 +70,10 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
* @param[in] compile_context The compile context to be used.
@@ -74,7 +82,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Function to set the constant value on fill border kernel depending on type.
*
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 68fe324df6..7da0679ae4 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -30,20 +30,26 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -54,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -101,28 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
- auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma });
+ auto padding_info =
+ get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma});
_input_weights = input_weights;
_input_bias = input_bias;
@@ -135,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
_epsilon = epsilon;
_run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights);
- _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
+ _run_in_place_bias =
+ (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
// Configure kernel window
Window win = calculate_max_window(*input_weights->info());
@@ -165,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type()));
- build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
+ build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION,
+ "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
@@ -180,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
@@ -202,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::
// Add kernel arguments
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input_weights, slice_3d);
- if(_input_bias != nullptr)
+ if (_input_bias != nullptr)
{
add_1D_tensor_argument(idx, _input_bias, slice_1d);
}
add_1D_tensor_argument(idx, _bn_mean, slice_1d);
add_1D_tensor_argument(idx, _bn_var, slice_1d);
- if(!_run_in_place_weights)
+ if (!_run_in_place_weights)
{
add_3D_tensor_argument(idx, _fused_weights, slice_3d);
}
- if(!_run_in_place_bias)
+ if (!_run_in_place_bias)
{
add_1D_tensor_argument(idx, _fused_bias, slice_1d);
}
- if(_bn_beta != nullptr)
+ if (_bn_beta != nullptr)
{
add_1D_tensor_argument(idx, _bn_beta, slice_1d);
}
- if(_bn_gamma != nullptr)
+ if (_bn_gamma != nullptr)
{
add_1D_tensor_argument(idx, _bn_gamma, slice_1d);
}
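
[Editor's note] For reference, the fusion this kernel performs on the device is the standard batch-norm folding identity: fused_weights = input_weights * gamma / sqrt(var + epsilon) and fused_bias = beta + (input_bias - mean) * gamma / sqrt(var + epsilon). Below is a minimal host-side C++ sketch of that identity for a single output channel; it is illustrative only (the names are ours, and the real computation happens in the OpenCL kernel selected above).

#include <cmath>
#include <vector>

// Host-side reference of the standard batch-norm folding identity
// (a sketch, not the OpenCL kernel itself).
void fuse_bn_reference(std::vector<float> &weights, // weights of one output channel
                       float             &bias,    // bias of that channel
                       float mean, float var, float gamma, float beta, float epsilon)
{
    const float scale = gamma / std::sqrt(var + epsilon);
    for (float &w : weights)
    {
        w *= scale; // fused_weights = input_weights * gamma / sqrt(var + eps)
    }
    bias = beta + (bias - mean) * scale; // fused_bias
}
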
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
index 78b1e74cab..76ec7a759f 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -62,9 +62,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Set the source, destination of the kernel
*
* @param[in] compile_context The compile context to be used.
@@ -81,9 +88,17 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -101,10 +116,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
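
[Editor's note] The header above follows the library's usual validate-then-configure pattern. A sketch of that flow, assuming the tensors are allocated elsewhere and relying only on the signatures shown in this diff:

#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"

using namespace arm_compute;

// Validate the tensor infos first, then configure the kernel only on success.
// The optional bias/beta/gamma arguments fall back to their defaults here.
Status fuse_bn(ICLTensor *weights, ICLTensor *mean, ICLTensor *var,
               ICLTensor *fused_weights, ICLTensor *fused_bias,
               CLFuseBatchNormalizationKernel &kernel)
{
    const Status s = CLFuseBatchNormalizationKernel::validate(
        weights->info(), mean->info(), var->info(),
        fused_weights->info(), fused_bias->info());
    if (bool(s))
    {
        kernel.configure(weights, mean, var, fused_weights, fused_bias);
    }
    return s;
}
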
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index 5495023b80..c11a18940a 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -34,7 +36,8 @@ namespace arm_compute
{
namespace
{
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+inline Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
@@ -43,11 +46,12 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -56,12 +60,14 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
// Output auto initialization if not yet initialized
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
auto_init_if_empty((*output), output_shape, 1, input->data_type());
// Create window
@@ -72,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
-CLGatherKernel::CLGatherKernel()
- : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices,
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGatherKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
- auto padding_info = get_padding_info({ input, output, indices });
+ auto padding_info = get_padding_info({input, output, indices});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
// Configure kernel window
@@ -100,7 +109,8 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
// Set build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option("-DINDICES_DIM_Z=" + support::cpp11::to_string(indices->info()->dimension(2)));
build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
@@ -114,10 +124,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
return Status{};
}
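
[Editor's note] The gather semantics behind compute_gather_shape() and wrap_around() above are: the output replaces the input's axis dimension with the indices shape, and negative axes wrap once around the rank. A self-contained 1-D reference sketch (our own illustration, valid for axis in [-rank, rank)):

#include <cstddef>
#include <vector>

// Single-wrap axis resolution, e.g. axis -1 on a 4-D tensor resolves to 3.
int wrap_around_ref(int axis, int num_dims)
{
    return axis < 0 ? axis + num_dims : axis;
}

// 1-D gather: out[j] = in[indices[j]].
std::vector<float> gather_ref(const std::vector<float> &input, const std::vector<int> &indices)
{
    std::vector<float> out;
    out.reserve(indices.size());
    for (int i : indices)
    {
        out.push_back(input.at(static_cast<size_t>(i)));
    }
    return out;
}
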
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
index 8f472a4696..db4b49d2f5 100644
--- a/src/core/CL/kernels/CLGatherKernel.h
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
*
@@ -74,7 +79,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 088c454f3c..b9ff72b928 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
size_t feature_height = info.feat_height();
size_t feature_width = info.feat_width();
@@ -57,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -66,21 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
}
} // namespace
-CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
- : _anchors(nullptr), _all_anchors(nullptr)
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
}
-void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
- auto padding_info = get_padding_info({ anchors, all_anchors });
+ auto padding_info = get_padding_info({anchors, all_anchors});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
// Metadata
@@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
@@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
}
// Create kernel
- const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ const std::string kernel_name =
+ (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields).
// This means we don't need to pad on the X dimension, as we know in advance how many fields
@@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
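
[Editor's note] A host-side reference for the "compute all anchors" step, under the usual definition: each base anchor (x1, y1, x2, y2) is replicated at every feature-map cell, shifted by the cell position times the stride. This sketch assumes stride = 1 / spatial_scale; the output ordering matches the (values_per_roi, W * H * num_anchors) shape configured above.

#include <array>
#include <cstddef>
#include <vector>

std::vector<std::array<float, 4>>
all_anchors_ref(const std::vector<std::array<float, 4>> &base_anchors,
                size_t feat_w, size_t feat_h, float stride)
{
    std::vector<std::array<float, 4>> out;
    out.reserve(feat_w * feat_h * base_anchors.size());
    for (size_t y = 0; y < feat_h; ++y)
    {
        for (size_t x = 0; x < feat_w; ++x)
        {
            const float fx = static_cast<float>(x) * stride;
            const float fy = static_cast<float>(y) * stride;
            for (const auto &a : base_anchors)
            {
                // Shift every base anchor to this feature-map cell.
                out.push_back({a[0] + fx, a[1] + fy, a[2] + fx, a[3] + fy});
            }
        }
    }
    return out;
}
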
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
index d26795ac7d..e08f281d6c 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -62,7 +62,10 @@ public:
* @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
*
@@ -81,5 +84,5 @@ private:
const ICLTensor *_anchors;
ICLTensor *_all_anchors;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 7ed323c950..b13eb16556 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -39,17 +40,20 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
@@ -59,27 +63,30 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
} // namespace
-CLComputeMeanVariance::CLComputeMeanVariance()
- : _input(nullptr), _output(nullptr)
+CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision)
+void CLComputeMeanVariance::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ bool use_mixed_precision)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
@@ -88,7 +95,8 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I
const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
CLBuildOptions build_opts;
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" +
+ (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
@@ -108,7 +116,7 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I
const TensorShape out_shape(input_channel, 2u, input_batches);
// Output auto initialization if not yet initialized
- if(use_mixed_precision)
+ if (use_mixed_precision)
{
auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32);
}
@@ -134,7 +142,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -157,10 +165,14 @@ CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
@@ -172,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision
+ ? "float"
+ : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -188,7 +202,7 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(1));
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
}
@@ -197,7 +211,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
return Status{};
@@ -211,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -226,7 +242,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
add_4D_tensor_argument(idx, _input, collapsed_window);
add_3D_tensor_argument(idx, _mean, collapsed_window);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_4D_tensor_argument(idx, _output, collapsed_window);
}
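
[Editor's note] For orientation, instance normalization computes, per (batch, channel) plane over the spatial dimensions, y = gamma * (x - mean) / sqrt(var + epsilon) + beta; the mean/variance come from the CLComputeMeanVariance pass above. A reference sketch of the usual definition for one plane (assumes a non-empty plane; not the kernel itself):

#include <cmath>
#include <vector>

void instance_norm_ref(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
    // Mean over the spatial plane
    float mean = 0.f;
    for (float v : plane)
        mean += v;
    mean /= static_cast<float>(plane.size());

    // Biased variance over the spatial plane
    float var = 0.f;
    for (float v : plane)
        var += (v - mean) * (v - mean);
    var /= static_cast<float>(plane.size());

    const float inv_stddev = 1.f / std::sqrt(var + epsilon);
    for (float &v : plane)
        v = gamma * (v - mean) * inv_stddev + beta;
}
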
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
index 2f9014a651..9f436da7f6 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
* @param[in] info Kernel meta-data descriptor
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
@@ -69,7 +73,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -106,7 +111,8 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
* @param[in] use_mixed_precision Use mixed precision in case of FP16 execution
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
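
[Editor's note] The auxiliary mean/variance tensor produced by CLComputeMeanVariance is built above with shape (channels, 2, batches), i.e. a {mean, variance} pair per (channel, batch). A sketch of reading it back, assuming a dense dim0-fastest buffer with no padding (a simplification on our part; real CL tensors may carry padding and strides):

#include <cstddef>
#include <vector>

struct MeanVar
{
    float mean;
    float var;
};

// Element (c, j, n) of a dense (C, 2, N) tensor lives at c + j*C + n*2*C.
MeanVar read_mean_var(const std::vector<float> &buf, size_t channels, size_t c, size_t n)
{
    const size_t base = n * 2 * channels + c; // j = 0 plane holds the means
    return {buf[base], buf[base + channels]}; // j = 1 plane holds the variances
}
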
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 542d380e4a..9ed9d7c5b0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -31,10 +31,10 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -43,7 +43,8 @@ namespace
{
constexpr int max_input_tensor_dim = 3;
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -53,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -78,16 +80,22 @@ CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(
+ const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
{
configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon);
}
-void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
- auto padding_info = get_padding_info({ input, sum, output });
+ auto padding_info = get_padding_info({input, sum, output});
_input = input;
_sum = sum;
@@ -95,8 +103,9 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_actual_axis = wrap_around(axis, max_input_tensor_dim);
_epsilon = epsilon;
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
- const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
// Set build options
CLBuildOptions build_opts;
@@ -107,7 +116,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
// Create kernel
std::string kernel_name;
unsigned int idx = 0;
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
kernel_name = "l2_normalize_x";
@@ -127,7 +136,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set epsilon argument
- if(input->info()->data_type() == DataType::F32)
+ if (input->info()->data_type() == DataType::F32)
{
_kernel.setArg<cl_float>(idx, _epsilon);
}
@@ -146,7 +155,8 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status CLL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
return Status{};
@@ -159,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
Window window_sum(window);
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
{
@@ -173,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 1:
@@ -189,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 2:
@@ -205,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_3D_tensor_argument(idx, _sum, sum_slice);
add_3D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+ } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
}
break;
default:
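
[Editor's note] The kernel consumes a precomputed sum of squares from the reduction stage, with epsilon acting as a lower bound (per the "Lower bound value for the normalization" doc). A reference sketch of that step along one row:

#include <algorithm>
#include <cmath>
#include <vector>

// y_i = x_i / sqrt(max(sum_of_squares, epsilon))
void l2_normalize_ref(std::vector<float> &row, float sum_of_squares, float epsilon)
{
    const float inv_norm = 1.f / std::sqrt(std::max(sum_of_squares, epsilon));
    for (float &v : row)
        v *= inv_norm;
}
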
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
index edc0585217..5c9ab94ce5 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -70,7 +71,12 @@ public:
* @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
* @param[in] epsilon Lower bound value for the normalization.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon);
/** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
*
@@ -84,7 +90,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index dc9d68626d..e560f1de4a 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,26 +43,31 @@ using namespace misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_size_x = pool_info.pool_size.width;
- const int pool_size_y = pool_info.pool_size.height;
+ const int pool_size_x = pool_info.pool_size.width;
+ const int pool_size_y = pool_info.pool_size.height;
const Size2D pool_size(pool_size_x, pool_size_y);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -71,17 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _indices(nullptr)
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr)
{
_type = CLKernelType::POOL;
}
-void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
- auto padding_info = get_padding_info({ input, indices, output });
+ auto padding_info = get_padding_info({input, indices, output});
_input = input;
_output = output;
@@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
@@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice);
add_3D_tensor_argument(idx, _indices, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
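
[Editor's note] Max unpooling is a scatter: the output starts zeroed and each input value is written back to the flat position recorded by the paired max-pooling indices tensor (U32, MAX pooling only, per the checks above). A reference sketch:

#include <cstdint>
#include <vector>

std::vector<float> max_unpool_ref(const std::vector<float>    &input,
                                  const std::vector<uint32_t> &indices,
                                  size_t                       output_size)
{
    std::vector<float> output(output_size, 0.f);
    for (size_t i = 0; i < input.size(); ++i)
    {
        // Scatter each pooled max back to its recorded source location.
        output.at(indices[i]) = input[i];
    }
    return output;
}
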
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index 45481d0507..eb18a46784 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -72,7 +76,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index ac33468ad8..8632bdf623 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -69,15 +70,19 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
_run_in_place = (output == nullptr) || (output == input);
- ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), *input->info());
}
@@ -85,7 +90,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
@@ -134,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
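
[Editor's note] As the _run_in_place logic above shows, omitting the output tensor makes this kernel normalize its input in place. A sketch of that usage, relying only on the compile-context overload and defaults visible in this diff (tensor setup elided):

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"

using namespace arm_compute;

void normalize_in_place(ICLTensor *input, CLMeanStdDevNormalizationKernel &kernel)
{
    // output defaults to nullptr and epsilon to 1e-8f, so 'input' is
    // normalized in place when the kernel is run.
    kernel.configure(CLKernelLibrary::get().get_compile_context(), input);
}
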
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
index a1ba2b905e..e02a3c58a3 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -66,7 +66,10 @@ public:
* @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
* @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
/** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index c6c4229c00..b636c485e7 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -63,7 +64,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
@@ -71,9 +73,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
bool window_changed = false;
Window win;
const DataLayout data_layout = input->data_layout();
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
const bool is_norm_across_width = norm_idx == 0;
@@ -87,15 +90,16 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// The output has 1 right padding because of the vec_size_x.
// The input has 1 left padding because radius = 1.
// The input has 2 right padding because of radius = 1 AND because of the extra output padding
- const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
- const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
- const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
+ const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+ const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
win = calculate_max_window(*input, Steps(vec_size_x));
// We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
// Reads can occur within the valid region of the input
- if(is_norm_across_width)
+ if (is_norm_across_width)
{
AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
window_changed = window_changed || update_window_and_padding(win, input_access);
@@ -112,13 +116,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
else
{
unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
- if(norm_info.is_cross_map())
+ if (norm_info.is_cross_map())
{
vec_size_x = 1;
}
win = calculate_max_window(*input, Steps(vec_size_x));
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -139,10 +144,13 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
@@ -152,16 +160,17 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
- int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
- if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
+ const DataLayout data_layout = input->info()->data_layout();
+ unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
{
vec_size_x = 1;
vec_size_x_leftovers = 0;
}
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
_is_norm_across_width = norm_idx == 0;
@@ -175,9 +184,10 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
// The output has 1 right padding because of the vec_size_x.
// The input has 1 left padding because radius = 1.
// The input has 2 right padding because of radius = 1 AND the extra output padding
- const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
- const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
- _border_size = BorderSize(0, border_width_right, 0, border_width_left);
+ const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+ _border_size = BorderSize(0, border_width_right, 0, border_width_left);
}
const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
@@ -193,12 +203,14 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
- build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()),
+ "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC,
+ "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
// Create kernel
std::string kernel_name;
- if(norm_info.is_in_map())
+ if (norm_info.is_in_map())
{
kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
}
@@ -222,16 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
}
-Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+Status CLNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
return Status{};
}
@@ -251,7 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
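The NCHW branch in this file derives the kernel's left/right border from the normalization radius and the adjusted vector size. A minimal standalone sketch of that arithmetic, with hypothetical names, mirroring the expressions in the hunk above rather than any library internals:

// Sketch: border widths for cross-width normalization (names hypothetical).
// norm_radius = norm_size / 2; vec_size is already clamped to the row width.
struct Border
{
    unsigned int left;
    unsigned int right;
};

Border norm_border(bool norm_across_width, unsigned int norm_radius, unsigned int vec_size, unsigned int width)
{
    const unsigned int left = norm_across_width ? norm_radius : 0;
    // The right border covers the radius plus whatever the last vector
    // access reads past the end of the row.
    const unsigned int right = norm_across_width ? norm_radius + (vec_size - width % vec_size) : 0;
    return {left, right};
}

// Example: radius 1, vec_size 4, width 10 -> left = 1, right = 1 + (4 - 2) = 3.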
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
index 739a2ae9f1..5517ba6904 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -63,7 +63,10 @@ public:
* Data layouts supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -77,7 +80,7 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
// Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run(const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;
private:
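These header hunks are pure reflow: wrapped parameter lists become one parameter per line, aligned on the opening parenthesis. The revised clang-format configuration is explicitly not part of this delivery, so the fragment below is only a hypothetical reconstruction of options (clang-format 14 syntax) consistent with the output visible in this diff, not the actual file:

# Hypothetical .clang-format fragment; the real revised config is not in this delivery.
ColumnLimit: 120                      # lines wrap near 120 columns in these hunks
IndentWidth: 4
BreakBeforeBraces: Allman             # braces on their own line
BinPackParameters: false              # one parameter per line once a declaration wraps
AlignAfterOpenBracket: Align          # continuation lines align on the open parenthesis
AlignConsecutiveAssignments: true     # '=' alignment in configure() bodies
SpaceBeforeParens: ControlStatements  # 'if (' / 'for (' / 'while (' / 'switch ('
PointerAlignment: Right               # 'const ICLTensor *input'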
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 6b0400d50e..59352a8fb7 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -31,32 +31,35 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -77,7 +80,8 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
bool window_changed = update_window_and_padding(win, input_access, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -88,12 +92,19 @@ CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
@@ -102,7 +113,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -112,9 +123,10 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
const DataLayout data_layout = input->info()->data_layout();
// Get number of elements to process per iterations
- const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(),
- input->info()->dimension(0)) :
- (16 / input->info()->element_size());
+ const unsigned int num_elems_processed_per_iteration =
+ (data_layout == DataLayout::NHWC)
+ ? adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0))
+ : (16 / input->info()->element_size());
const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const DataType dt = input->info()->data_type();
@@ -122,11 +134,12 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
- build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
std::string kernel_name = "normalize_planar_yuv_layer_";
- if(is_data_type_quantized(dt))
+ if (is_data_type_quantized(dt))
{
const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)));
@@ -139,7 +152,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
@@ -165,12 +178,16 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_config_id += support::cpp11::to_string(input->info()->dimension(2));
}
-Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
- if(input->data_layout() == DataLayout::NCHW)
+ if (input->data_layout() == DataLayout::NCHW)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
}
return Status{};
}
@@ -196,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
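The NHWC path in this file picks its vector width as 16 bytes divided by the element size, clamps it to the innermost dimension, and passes the tail as -DVEC_SIZE_LEFTOVER. A worked sketch, assuming a halving clamp in the spirit of AdjustVecSize.h (an assumption, not the library's exact code):

#include <cstdio>

// Assumed clamp: halve the preferred lane count until it fits dim0 (sketch only).
unsigned int adjust_vec_size_sketch(unsigned int vec_size, unsigned int dim0)
{
    while (vec_size > dim0 && vec_size > 1)
    {
        vec_size /= 2;
    }
    return vec_size;
}

int main()
{
    const unsigned int element_size = 2;  // e.g. F16
    const unsigned int dim0         = 20; // innermost (channel) dimension
    const unsigned int vec_size     = adjust_vec_size_sketch(16 / element_size, dim0); // 8
    const unsigned int leftover     = dim0 % vec_size;                                 // 4
    std::printf("-DVEC_SIZE=%u -DVEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
    return 0;
}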
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
index 6db4433e78..341b404e3d 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -67,7 +67,11 @@ public:
* @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
* Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -79,7 +83,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 53f313c0d3..0ac285038e 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -35,25 +36,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions()));
- if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
- for(size_t i = 0; i < padding.size(); ++i)
+ for (size_t i = 0; i < padding.size(); ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
}
}
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
@@ -65,41 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLPadLayerKernel::CLPadLayerKernel()
- : _input(nullptr), _output(nullptr), _4d_enabled(false)
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(
+ const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
// Set build options
- const DataType &data_type = input->info()->data_type();
- const unsigned int input_width = input->info()->dimension(0);
- const unsigned int input_height = input->info()->dimension(1);
- const unsigned int input_depth = input->info()->dimension(2);
- const unsigned int pad_x_before = padding.at(0).first;
- const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
- const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
- const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width);
- const unsigned int pad_right_start = input_width + pad_x_before;
- const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
- const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int vec_size = adjust_vec_size(
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))),
+ input_width);
+ const unsigned int pad_right_start = input_width + pad_x_before;
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int vec_size_leftover_write =
+ vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -108,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write));
- if(padding.size() > 1)
+ if (padding.size() > 1)
{
build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- if(padding.size() > 2)
+ if (padding.size() > 2)
{
build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
@@ -121,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
}
std::string kernel_name = "pad_layer_";
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
kernel_name += "constant";
- const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
+ const unsigned int vec_size_leftover_read =
+ vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read));
- if(pad_x_before >= vec_size)
+ if (pad_x_before >= vec_size)
{
build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size));
- build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size));
+ build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" +
+ support::cpp11::to_string(pad_right_start / vec_size));
}
- if(_4d_enabled)
+ if (_4d_enabled)
{
build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
@@ -154,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
- const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+ const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
- build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
- build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
- build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
break;
}
@@ -179,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
return Status{};
@@ -197,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
- if(_4d_enabled)
+ if (_4d_enabled)
{
add_argument<unsigned int>(idx, batch++);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
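vec_size_leftover_write above measures how many lanes of the final vector store actually land inside the output row. A worked sketch of that expression, with ceil_to_multiple written out explicitly (helper names hypothetical):

// Smallest multiple of m that is >= x.
unsigned int ceil_to_multiple_sketch(unsigned int x, unsigned int m)
{
    return ((x + m - 1) / m) * m;
}

// Lanes of the last vector store that fall inside a row of width out_w.
unsigned int leftover_write(unsigned int out_w, unsigned int vec_size)
{
    return vec_size - (ceil_to_multiple_sketch(out_w, vec_size) - out_w);
}

// Example: out_w = 21, vec_size = 8 -> ceil = 24, overshoot = 3, leftover = 5.
// When out_w is already a multiple of vec_size the overshoot is 0 and the
// full vector (vec_size lanes) is written.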
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
index 90af337f94..dca121b6a1 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.h
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -56,7 +56,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Set the input and output tensor.
*
* @param[in] compile_context The compile context to be used.
@@ -68,8 +72,12 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -80,7 +88,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index bf1b874dd0..7dcdf1de6f 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater than or equal to 0");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater than or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
}
@@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ const PriorBoxLayerInfo &info,
+ int num_priors)
{
ARM_COMPUTE_UNUSED(input2);
// Output tensor auto initialization if not yet initialized
@@ -88,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
auto_init_if_empty(*output, output_shape, 1, input1->data_type());
const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -102,13 +112,25 @@ CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios);
}
-void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min,
- cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -135,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
int img_width = info.img_size().x;
int img_height = info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = input2->info()->dimension(width_idx);
img_height = input2->info()->dimension(height_idx);
@@ -143,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
float step_x = info.steps()[0];
float step_y = info.steps()[0];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -162,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
build_opts.add_option_if(info.clip(), "-DIN_PLACE");
- if(info.variances().size() > 1)
+ if (info.variances().size() > 1)
{
- for(unsigned int i = 0; i < info.variances().size(); ++i)
+ for (unsigned int i = 0; i < info.variances().size(); ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(i)));
}
}
else
{
- for(unsigned int i = 0; i < 4; ++i)
+ for (unsigned int i = 0; i < 4; ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(0)));
}
}
@@ -194,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
ICLKernel::configure_internal(win_config.second);
}
-Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(),
+ output->clone().get(), info, num_priors)
+ .first);
return Status{};
}
@@ -211,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
- queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
- if(!_info.max_sizes().empty())
+ queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float),
+ _info.aspect_ratios().data());
+ if (!_info.max_sizes().empty())
{
queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
}
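validate() in this file derives num_priors from the layer info, and configure() falls back to image/layer ratios when the steps are zero. A compact sketch of both computations, using a hypothetical plain struct in place of PriorBoxLayerInfo:

#include <cstddef>

// Hypothetical stand-in for the fields read from PriorBoxLayerInfo.
struct PriorBoxSketch
{
    std::size_t num_aspect_ratios;
    std::size_t num_min_sizes;
    std::size_t num_max_sizes;
    float       step_x;
    float       step_y; // 0.f means "derive from the sizes"
};

// Mirrors: aspect_ratios().size() * min_sizes().size() + max_sizes().size()
std::size_t num_priors(const PriorBoxSketch &info)
{
    return info.num_aspect_ratios * info.num_min_sizes + info.num_max_sizes;
}

// Mirrors the fallback branch above: step defaults to image size / layer size.
void resolve_steps(PriorBoxSketch &info, float img_w, float img_h, float layer_w, float layer_h)
{
    if (info.step_x == 0.f || info.step_y == 0.f)
    {
        info.step_x = img_w / layer_w;
        info.step_y = img_h / layer_h;
    }
}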
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
index 6c369a7a4e..a50e0c5ff5 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -57,7 +57,13 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -69,8 +75,14 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
- cl::Buffer *aspect_ratios);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
*
* @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -80,14 +92,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
ICLTensor *_output;
PriorBoxLayerInfo _info;
int _num_priors;
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index bd573e54c8..731fcb8e04 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -22,10 +22,12 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -49,14 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
    /* If the width is less than the step, make the step the same as the width to avoid the global size being the step instead of the actual width. */
    /* Alternatively this should be fixed in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
- const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+ const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration)
+ ? input->dimension(0)
+ : temp_num_elems_processed_per_iteration;
// This kernel doesn't need padding
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
return std::make_pair(Status{}, win);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
@@ -72,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -87,10 +94,14 @@ CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
- auto padding_info = get_padding_info({ input, weight, bias, output });
+ auto padding_info = get_padding_info({input, weight, bias, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
@@ -104,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
int32_t output_multiplier{};
int32_t output_shift{};
const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
- const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+ const Status status =
+ quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
output_shift *= -1;
// Set build options
@@ -114,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
- build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMIN_BOUND=" +
+ support::cpp11::to_string(std::get<0>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMAX_BOUND=" +
+ support::cpp11::to_string(std::get<1>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
// Create kernel
_kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
@@ -135,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
}
-Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
@@ -171,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
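calculate_quantized_multiplier() above decomposes the weight scale into a fixed-point multiplier and a power-of-two shift (which the kernel then negates for its own convention). A hedged sketch of the usual scale ~= (m / 2^31) * 2^shift decomposition; the library's exact rounding may differ:

#include <cmath>
#include <cstdint>

// Decompose scale ~= (multiplier / 2^31) * 2^shift, multiplier in [2^30, 2^31).
// Sketch of the standard scheme, not necessarily the library's exact rounding.
void quantized_multiplier_sketch(double scale, std::int32_t *multiplier, std::int32_t *shift)
{
    int    exp  = 0;
    double frac = std::frexp(scale, &exp); // scale = frac * 2^exp, frac in [0.5, 1)
    auto   q    = static_cast<std::int64_t>(std::llround(frac * (1ll << 31)));
    if (q == (1ll << 31)) // frexp's fraction can round up to 1.0
    {
        q /= 2;
        ++exp;
    }
    *multiplier = static_cast<std::int32_t>(q);
    *shift      = exp; // the kernel above then applies: output_shift *= -1;
}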
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
index 31085c37ba..ba912e1d2d 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -63,7 +63,11 @@ public:
* @param[in] weight Weight tensor. Data types supported: Same as @p input.
* @param[in] bias Bias tensor. Data types supported: S32.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias);
/** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16.
@@ -73,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 69a6fa5fa0..c97910ef79 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,24 +43,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -82,12 +88,19 @@ CLROIAlignLayerKernel::CLROIAlignLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
@@ -97,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
output->info()->set_data_layout(input->info()->data_layout());
- auto padding_info = get_padding_info({ input, rois, output });
+ auto padding_info = get_padding_info({input, rois, output});
_input = input;
_output = output;
@@ -111,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
- build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
- build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
+ build_opts.add_option("-DMAX_DIM_X=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+ build_opts.add_option("-DMAX_DIM_Y=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+ build_opts.add_option("-DMAX_DIM_Z=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
- build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+ build_opts.add_option_if(pool_info.sampling_ratio() > 0,
+ "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
- if(is_qasymm)
+ if (is_qasymm)
{
const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform();
@@ -144,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
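The -DMAX_DIM_X/Y/Z options above resolve tensor dimensions through get_data_layout_dimension_index(), which maps the logical WIDTH/HEIGHT/CHANNEL axes to a physical index depending on layout (dimension 0 is innermost). A sketch of that mapping as it is used throughout the library:

enum class Layout { NCHW, NHWC };
enum class Dim    { WIDTH, HEIGHT, CHANNEL };

// NCHW stores W, H, C at indices 0, 1, 2; NHWC stores C, W, H at 0, 1, 2.
unsigned int dim_index(Layout layout, Dim dim)
{
    if (layout == Layout::NCHW)
    {
        switch (dim)
        {
            case Dim::WIDTH:   return 0;
            case Dim::HEIGHT:  return 1;
            case Dim::CHANNEL: return 2;
        }
    }
    switch (dim) // NHWC: channels are innermost
    {
        case Dim::CHANNEL: return 0;
        case Dim::WIDTH:   return 1;
        case Dim::HEIGHT:  return 2;
    }
    return 0; // unreachable
}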
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
index 5284a5913f..2e84e5d303 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.h
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -61,7 +61,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -77,7 +78,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -93,7 +98,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue);
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index f6933c6cfd..1b2c414a49 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,7 +49,10 @@ CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
@@ -61,10 +65,11 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
}
@@ -72,20 +77,30 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor
return Status{};
}
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
- ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
- auto padding_info = get_padding_info({ input, rois, output });
+ auto padding_info = get_padding_info({input, rois, output});
// Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
- auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
+ auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
// Set instance variables
_input = input;
@@ -107,11 +122,12 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
- if(is_qasymm)
+ if (is_qasymm)
{
// Determine quantization info scale, offset
UniformQuantizationInfo uqinfo = UniformQuantizationInfo();
- uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
+ uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(),
+ _output->info()->quantization_info().uniform());
build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
index 7b7b457632..80bfb63092 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -59,7 +59,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -74,7 +75,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -92,7 +97,10 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
const ICLTensor *_input;
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index a06c2eed75..622f6210b9 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16;
Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
@@ -56,19 +54,22 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-CLRangeKernel::CLRangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -78,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRangeKernel::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step));
// Configure kernel window
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- auto padding_info = get_padding_info({ output });
+ auto padding_info = get_padding_info({output});
_start = start;
_end = end;
@@ -100,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
- if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
{
const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset));
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
index 1b94a099ed..65251a11e5 100644
--- a/src/core/CL/kernels/CLRangeKernel.h
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLRANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index e5cfb997ca..70875a2d40 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -28,15 +28,15 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -47,23 +47,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8,
+ "Not supported reduction operation for QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8)
- && (input->data_type() != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
+ ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) &&
+ (input->data_type() != DataType::QASYMM8) &&
+ (input->data_type() != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN),
+ "Not supported reduction operation, use CLArgMinMaxLayer");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -79,33 +84,42 @@ CLReductionOperationKernel::CLReductionOperationKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_reduction_axis = axis;
_op = op;
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
// Set build options
CLBuildOptions build_opts;
DataType data_type = input->info()->data_type();
std::string data_type_promoted{};
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
data_type_promoted = "int";
}
@@ -130,10 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
-
- switch(op)
+ build_opts.add_option_if(is_data_type_quantized(data_type),
+ "-DOFFSET=" +
+ support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+ build_opts.add_option_if(
+ is_data_type_quantized(data_type),
+ "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
+
+ switch (op)
{
case ReductionOperation::SUM_SQUARE:
build_opts.add_option(("-DOPERATION=square_sum"));
@@ -159,7 +177,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
std::string kernel_axis_name;
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(axis)
+ switch (axis)
{
case 0:
{
@@ -187,13 +205,17 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(vec_size));
- win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
@@ -205,18 +227,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(is_serial_op)
+ if (is_serial_op)
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ Window window_in{window};
+ window_in.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- Window out_window{ window };
+ Window out_window{window};
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
Window in_slice = window_in.first_slice_window_1D();
@@ -228,8 +251,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_1D_tensor_argument(idx, _input, in_slice);
add_1D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
else
{
@@ -251,8 +273,9 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
case 1:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window window_in{window};
+ window_in.set(Window::DimY,
+ Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
Window in_slice = window_in.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
@@ -262,15 +285,15 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
break;
case 2:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window window_in{window};
+ window_in.set(Window::DimZ,
+ Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
Window in_slice = window_in.first_slice_window_3D();
Window out_slice = window.first_slice_window_3D();
@@ -280,14 +303,13 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
}
break;
case 3:
{
// Get first input and output slices
- Window window_in{ window };
+ Window window_in{window};
window_in.set(3, Window::Dimension(0, 1, 1));
Window in_slice = window_in.first_slice_window_4D();
Window out_slice = window.first_slice_window_4D();
@@ -298,8 +320,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_4D_tensor_argument(idx, _input, in_slice);
add_4D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
}
break;
default:
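
Every kernel touched by this diff follows the same two-phase contract being re-wrapped here: a static validate() that only inspects ITensorInfo descriptors, a configure() that builds the OpenCL program, and a run() that enqueues one launch per window slice. A hedged usage sketch (tensor allocation and queue creation elided; assumes Status's boolean conversion signals success):

CLReductionOperationKernel kernel;
const unsigned int       axis = 0;
const ReductionOperation op   = ReductionOperation::SUM;

// Dry-run the shape/type checks before touching any device resources.
if (bool(CLReductionOperationKernel::validate(input.info(), output.info(), axis, op)))
{
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &input, &output, axis, op);
    kernel.run(kernel.window(), queue); // window() holds what configure_internal() set
}
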
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
index b456378746..2f94b2add3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.h
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -67,7 +68,11 @@ public:
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
*
@@ -79,7 +84,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index 3c74e80d33..9fd21943e8 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -28,9 +28,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -51,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -66,8 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLReorgLayerKernel::CLReorgLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -77,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
- const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ std::string kernel_name =
+ std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+ const size_t idx_channel =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
// Create kernel
CLBuildOptions build_opts;
@@ -98,7 +106,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
// Configure window
     // auto initialize the output tensor if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
Window win = calculate_max_window(*output->info(), Steps());
@@ -119,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ int32_t stride)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
@@ -139,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
index 455a6170c6..f335071e9f 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.h
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index 0d70ff4f3c..79a0f03b1e 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -60,8 +61,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLReverseKernel::CLReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr)
+CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -71,10 +71,13 @@ void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
}
-void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
- auto padding_info = get_padding_info({ input, output, axis });
+ auto padding_info = get_padding_info({input, output, axis});
_input = input;
_output = output;
@@ -138,7 +141,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_1D_tensor_argument(idx, _axis, axis_slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
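
The `} while (...)` rewrites above all touch one recurring idiom: walk a Window slice by slice, rebinding tensor arguments and enqueueing one NDRange per slice until the slide call reports the window exhausted. The shape of that loop, condensed (member names as in the kernels above):

Window slice = collapsed.first_slice_window_4D();
do
{
    unsigned int idx = 0;
    add_4D_tensor_argument(idx, _input, slice);   // binds buffer plus per-slice offsets/strides
    add_4D_tensor_argument(idx, _output, slice);
    enqueue(queue, *this, slice, lws_hint());     // one launch per slice
} while (collapsed.slide_window_slice_4D(slice)); // advances slice; false when done
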
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
index 4a21e4f802..fbd99dc883 100644
--- a/src/core/CL/kernels/CLReverseKernel.h
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -60,7 +60,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
*
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index c0e014e8b8..703c64d8d3 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -51,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -63,13 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
}
} // namespace
-CLSelectKernel::CLSelectKernel()
- : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelectKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
@@ -80,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- auto padding_info = get_padding_info({ c, x, y, output });
+ auto padding_info = get_padding_info({c, x, y, output});
const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0));
const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
@@ -92,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
// Create kernel
std::string kernel_name = "select";
- if(_has_same_rank)
+ if (_has_same_rank)
{
kernel_name += "_same_rank";
}
else
{
const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
- if(is_input_rank_greater_than_two)
+ if (is_input_rank_greater_than_two)
{
const size_t width = x->info()->tensor_shape().x();
const size_t height = x->info()->tensor_shape().y();
@@ -128,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
return Status{};
@@ -142,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
- if(!_has_same_rank)
+ if (!_has_same_rank)
{
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -153,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
do
{
unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
- if(_has_same_rank)
+ if (_has_same_rank)
{
add_3D_tensor_argument(idx, _c, slice);
}
@@ -162,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
index b8c10cd7cf..c4256fd743 100644
--- a/src/core/CL/kernels/CLSelectKernel.h
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,11 @@ public:
     * @param[in]  y      Second input tensor. Data types supported: Same as @p x
     * @param[out] output Output tensor. Data types supported: Same as @p x.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
*
* @param[in] c Condition input tensor. Data types supported: U8.
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 3632ae2b03..f4c0839ad2 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,19 +39,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -61,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -70,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -88,16 +97,24 @@ CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
- auto padding_info = get_padding_info({ input, block_shape, paddings, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ auto padding_info = get_padding_info({input, block_shape, paddings, output});
_input = input;
_block_shape = block_shape;
@@ -111,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
- _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -126,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -153,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
@@ -166,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
- _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(
+ compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -218,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
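
The create_kernel() calls being re-indented in this file illustrate the compile-time specialization scheme used throughout: each add_option() becomes a -D macro for the OpenCL build, so shapes, paddings and data types are literal constants inside the .cl source, and the kernel name is suffixed with the data layout. A trimmed sketch with placeholder values:

CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=uint");  // container type picked from the element size
build_opts.add_option("-DBLOCK_SHAPE_X=2"); // static parameters baked into the program
build_opts.add_option("-DBLOCK_SHAPE_Y=2");
// e.g. "space_to_batch_static_nchw" or "..._nhwc", depending on the input layout
_kernel = create_kernel(compile_context, "space_to_batch_static_nchw", build_opts.options());
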
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
index 4817cfeef2..f9dce9db47 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -73,7 +78,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] compile_context The compile context to be used.
@@ -84,8 +94,13 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -95,7 +110,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -107,7 +125,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index c5ffdb588b..25662b5c62 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -64,8 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -75,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
@@ -94,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -136,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
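
For the auto-initialization above, compute_space_to_depth_shape() derives the output descriptor from the block size. As a hedged illustration (NCHW-style indexing assumed; the library resolves the indices through the data layout), space-to-depth with block N maps (W, H, C, B) to (W/N, H/N, C*N*N, B):

TensorShape space_to_depth_shape(const TensorShape &in, int32_t block) // illustration only
{
    TensorShape out = in;
    out.set(0, in[0] / block);         // width shrinks by the block size
    out.set(1, in[1] / block);         // height shrinks by the block size
    out.set(2, in[2] * block * block); // channels absorb the block x block patch
    return out;
}
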
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
index bb1ac5f9a6..d0932919e0 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 075c93ab60..23e26716e7 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
     // Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -73,18 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
}
} // namespace
-CLStackLayerKernel::CLStackLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output);
}
-void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -112,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
_kernel.setArg<cl_uint>(idx, idx_input);
}
-Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status CLStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
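The validate()/configure() split being reflowed above follows the library's usual two-phase pattern: validate() inspects ITensorInfo metadata without touching device memory, and configure() binds tensors and builds the OpenCL kernel. A minimal caller sketch, with hypothetical shapes and assuming compute_stack_shape inserts a new dimension of size num_tensors at position axis:

    // Illustrative only: stacking 5 rank-3 tensors along axis 0.
    TensorInfo input_info(TensorShape(8U, 4U, 2U), 1, DataType::F32);
    TensorInfo output_info{}; // empty, so the kernel auto-initializes it
    const unsigned int axis = 0, idx_input = 0, num_tensors = 5;
    const Status st = CLStackLayerKernel::validate(&input_info, axis, idx_input, num_tensors, &output_info);
    // On success the stacked shape would be [5, 8, 4, 2]: a new dimension of
    // size num_tensors inserted at position axis, existing dimensions shifted up.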
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
index 2865127a90..d3c17f529c 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.h
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,8 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
/** Initialise the kernel's inputs and output
*
* @note Supported input tensor rank: up to 4
@@ -74,7 +76,12 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -88,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 9acbafdb19..a8f6112820 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -22,11 +22,13 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLStridedSliceKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/bit_ops.h"
@@ -37,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -48,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -76,28 +80,33 @@ CLStridedSliceKernel::CLStridedSliceKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ auto padding_info = get_padding_info({input, output});
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
const TensorShape &input_shape = input->tensor_shape();
Coordinates starts_abs;
Coordinates ends_abs;
Coordinates final_strides;
- std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(starts_abs, ends_abs, final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
Window win = calculate_max_window(*output, Steps());
@@ -108,29 +117,33 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
// Update window if needed
- if(multi_access_x)
+ if (multi_access_x)
{
Window &updated_window = win;
updated_window.set(Window::DimX,
- Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+ Window::Dimension(updated_window.x().start(),
+ ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
// Create build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
- build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+ build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(starts_abs[i]));
+ build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(final_strides[i]));
build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
}
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
- "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
- "-DSRC_DEPTH=1");
+ "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1");
build_opts.add_option_if_else(output->num_dimensions() > 2,
"-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
"-DDST_DEPTH=1");
@@ -142,7 +155,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
_config_id = "strided_slice";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->data_type()));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->dimension(i));
@@ -156,11 +169,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
return Status{};
}
@@ -170,8 +189,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = window_collapsed.first_slice_window_4D();
@@ -182,7 +202,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
add_4D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_4D(slice));
+ } while (window_collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
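The three masks threaded through validate() and configure() above are interpreted bit-per-dimension, as documented in the header below. A hedged sketch of expressing input[1:, :, 2] on a rank-3 tensor (tensor infos hypothetical):

    // Illustrative only: slice input[1:, :, 2] from a rank-3 tensor.
    const Coordinates starts(1, 0, 2);
    const Coordinates ends(0, 0, 3); // ends for dims 0 and 1 overridden by end_mask
    const BiStrides   strides(1, 1, 1);
    const int32_t begin_mask       = 0;                   // all starts taken as given
    const int32_t end_mask         = (1 << 0) | (1 << 1); // dims 0 and 1 run to their full extent
    const int32_t shrink_axis_mask = 1 << 2;              // dim 2 collapses to the single index 2
    const Status st = CLStridedSliceKernel::validate(&input_info, &output_info, starts, ends,
                                                     strides, begin_mask, end_mask, shrink_axis_mask);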
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
index 4c201504f5..1cf5bcacec 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.h
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
#include <cstdint>
@@ -53,9 +54,15 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -71,9 +78,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
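Unlike the tensor-owning kernels elsewhere in this patch, CLStridedSliceKernel is stateless and is driven through run_op() with an ITensorPack, as in the run_op() hunk above. A hedged dispatch sketch (tensor and queue objects hypothetical):

    // Illustrative only: dispatching the configured kernel.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src_tensor);
    pack.add_tensor(TensorType::ACL_DST, &dst_tensor);
    kernel.run_op(pack, kernel.window(), queue);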
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 3e7015cfd2..fa996c4008 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLTileKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -39,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -55,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLTileKernel::CLTileKernel()
- : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -66,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -104,15 +106,14 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
// Configure window without padding
Window win = calculate_max_window(*output->info());
- if(multi_access_x)
+ if (multi_access_x)
{
// If multi-access is enabled, no thread should cross the tile boundaries. This means we need
// as many threads as are needed to cover a single tile, times multiples[0]. Note that if threads
// do not cross the tile boundaries, they won't cross the boundary of the last tile either, and
// we don't need to pad the output
const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+ win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -121,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_config_id = "tile";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
- for(unsigned int i = 0; i < multiples.size(); ++i)
+ for (unsigned int i = 0; i < multiples.size(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -150,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
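The multi-access window sizing above can be checked with a small arithmetic example (values hypothetical):

    // ceil_to_multiple(input width, vec_size_x) threads cover one tile in X;
    // scaling by multiples[0] covers all tiles without crossing tile edges.
    const unsigned int input_w    = 10, vec_size_x = 4, multiple_x = 3;
    const unsigned int per_tile   = ((input_w + vec_size_x - 1) / vec_size_x) * vec_size_x; // 12
    const unsigned int size_win_x = per_tile * multiple_x;                                  // 36
    // X is stepped in chunks of vec_size_x, so no vector access straddles a
    // tile boundary and the output needs no padding.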
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
index 41752ca90b..c3486aecef 100644
--- a/src/core/CL/kernels/CLTileKernel.h
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -64,7 +64,10 @@ public:
* @param[out] output Destination tensor. Same as @p input
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
/** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index 6a3f66fd5a..9980db42f3 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/Error.h"
+
#include "src/common/cpuinfo/CpuInfo.h"
#include "src/common/cpuinfo/CpuIsaInfo.h"
@@ -43,8 +44,7 @@ CPUInfo &CPUInfo::get()
return _cpuinfo;
}
-CPUInfo::CPUInfo()
- : _impl(std::make_unique<Impl>())
+CPUInfo::CPUInfo() : _impl(std::make_unique<Impl>())
{
_impl->info = cpuinfo::CpuInfo::build();
}
diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h
index df192b5131..fe253508cf 100644
--- a/src/core/CPP/Validate.h
+++ b/src/core/CPP/Validate.h
@@ -38,8 +38,8 @@ namespace arm_compute
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
{
bool fp16_kernels_enabled = false;
#if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS)
@@ -47,8 +47,9 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
#endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled),
- function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function,
+ file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
return Status{};
}
@@ -61,8 +62,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
{
bool bf16_kernels_enabled = false;
#if defined(ARM_COMPUTE_ENABLE_BF16)
@@ -70,8 +71,9 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
- function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
+ function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
return Status{};
}
@@ -84,8 +86,8 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
@@ -101,8 +103,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
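Both helpers above combine a compile-time kernel switch with a runtime CPUInfo capability query. The FP16 gate condenses to the following predicate (an illustrative restatement, not a library API):

    bool fp16_supported(const ITensorInfo *info)
    {
        bool fp16_kernels_enabled = false;
    #if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS)
        fp16_kernels_enabled = true;
    #endif
        // Non-F16 tensors always pass; F16 needs both the build flag and CPU support.
        return info->data_type() != DataType::F16 ||
               (CPUInfo::get().has_fp16() && fp16_kernels_enabled);
    }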
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 0f405d8e83..02686eb4f6 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include <algorithm>
@@ -34,7 +35,11 @@ namespace arm_compute
namespace
{
template <typename T>
-std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> SoftNMS(const ITensor *proposals,
+ std::vector<std::vector<T>> &scores_in,
+ std::vector<int> inds,
+ const BoxNMSLimitInfo &info,
+ int class_id)
{
std::vector<int> keep;
const int proposals_width = proposals->info()->dimension(1);
@@ -45,7 +50,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
std::vector<T> y2(proposals_width);
std::vector<T> areas(proposals_width);
- for(int i = 0; i < proposals_width; ++i)
+ for (int i = 0; i < proposals_width; ++i)
{
x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -56,13 +61,13 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Note: Soft NMS scores have already been initialized with input scores
- while(!inds.empty())
+ while (!inds.empty())
{
// Find proposal with max score among remaining proposals
int max_pos = 0;
- for(unsigned int i = 1; i < inds.size(); ++i)
+ for (unsigned int i = 1; i < inds.size(); ++i)
{
- if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+ if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
{
max_pos = i;
}
@@ -75,7 +80,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
inds.erase(inds.begin());
std::vector<int> sorted_indices_temp;
- for(auto idx : inds)
+ for (auto idx : inds)
{
const auto xx1 = std::max(x1[idx], x1[element]);
const auto yy1 = std::max(y1[idx], y1[element]);
@@ -89,7 +94,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Update scores based on computed IoU, overlap threshold and NMS method
T weight;
- switch(info.soft_nms_method())
+ switch (info.soft_nms_method())
{
case NMSType::LINEAR:
weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
@@ -106,7 +111,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Discard boxes with new scores below min threshold and update pending indices
scores_in[class_id][idx] *= weight;
- if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+ if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
{
sorted_indices_temp.push_back(idx);
}
@@ -118,7 +123,10 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
}
template <typename T>
-std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> NonMaximaSuppression(const ITensor *proposals,
+ std::vector<int> sorted_indices,
+ const BoxNMSLimitInfo &info,
+ int class_id)
{
std::vector<int> keep;
@@ -130,7 +138,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
std::vector<T> y2(proposals_width);
std::vector<T> areas(proposals_width);
- for(int i = 0; i < proposals_width; ++i)
+ for (int i = 0; i < proposals_width; ++i)
{
x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -139,7 +147,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
}
- while(!sorted_indices.empty())
+ while (!sorted_indices.empty())
{
int i = sorted_indices.at(0);
keep.push_back(i);
@@ -148,7 +156,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
std::vector<int> new_indices;
sorted_indices_temp.erase(sorted_indices_temp.begin());
- for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+ for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
{
const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
@@ -163,8 +171,9 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
const float ctr_y = yy1 + (h / 2);
// If suppress_size is specified, filter the boxes based on their size and position
- const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
- if(ovr <= info.nms() && keep_size)
+ const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() &&
+ ctr_x < info.im_width() && ctr_y < info.im_height());
+ if (ovr <= info.nms() && keep_size)
{
new_indices.push_back(j);
}
@@ -172,7 +181,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
const unsigned int new_indices_size = new_indices.size();
std::vector<int> new_sorted_indices(new_indices_size);
- for(unsigned int i = 0; i < new_indices_size; ++i)
+ for (unsigned int i = 0; i < new_indices_size; ++i)
{
new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
}
@@ -184,7 +193,15 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
} // namespace
CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
- : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+ : _scores_in(nullptr),
+ _boxes_in(nullptr),
+ _batch_splits_in(nullptr),
+ _scores_out(nullptr),
+ _boxes_out(nullptr),
+ _classes(nullptr),
+ _batch_splits_out(nullptr),
+ _keeps(nullptr),
+ _keeps_size(nullptr),
_info()
{
}
@@ -197,7 +214,7 @@ bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const
template <typename T>
void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
{
- const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+ const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
const int num_classes = _scores_in->info()->dimension(0);
const int scores_count = _scores_in->info()->dimension(1);
std::vector<int> total_keep_per_batch(batch_size);
@@ -205,51 +222,48 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
int total_keep_count = 0;
std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
- for(int i = 0; i < scores_count; ++i)
+ for (int i = 0; i < scores_count; ++i)
{
- for(int j = 0; j < num_classes; ++j)
+ for (int j = 0; j < num_classes; ++j)
{
in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
}
}
int cur_start_idx = 0;
- for(int b = 0; b < batch_size; ++b)
+ for (int b = 0; b < batch_size; ++b)
{
// Skip the first class when there is more than one class.
const int j_start = (num_classes == 1 ? 0 : 1);
- for(int j = j_start; j < num_classes; ++j)
+ for (int j = j_start; j < num_classes; ++j)
{
std::vector<T> cur_scores(scores_count);
std::vector<int> inds;
- for(int i = 0; i < scores_count; ++i)
+ for (int i = 0; i < scores_count; ++i)
{
const T score = in_scores[j][i];
cur_scores[i] = score;
- if(score > _info.score_thresh())
+ if (score > _info.score_thresh())
{
inds.push_back(i);
}
}
- if(_info.soft_nms_enabled())
+ if (_info.soft_nms_enabled())
{
keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
}
else
{
std::sort(inds.data(), inds.data() + inds.size(),
- [&cur_scores](int lhs, int rhs)
- {
- return cur_scores[lhs] > cur_scores[rhs];
- });
+ [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; });
keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
}
total_keep_count += keeps[j].size();
}
- if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+ if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
{
// Merge all scores (represented by indices) together and sort
auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
@@ -257,10 +271,10 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
std::vector<T> ret(total_keep_count);
int ret_idx = 0;
- for(unsigned int i = 1; i < keeps.size(); ++i)
+ for (unsigned int i = 1; i < keeps.size(); ++i)
{
auto &cur_keep = keeps[i];
- for(auto &ckv : cur_keep)
+ for (auto &ckv : cur_keep)
{
ret[ret_idx++] = in_scores[i][ckv];
}
@@ -273,13 +287,13 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
auto all_scores_sorted = get_all_scores_sorted();
const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
- for(int j = 1; j < num_classes; ++j)
+ for (int j = 1; j < num_classes; ++j)
{
auto &cur_keep = keeps[j];
std::vector<int> new_keeps_j;
- for(auto &k : cur_keep)
+ for (auto &k : cur_keep)
{
- if(in_scores[j][k] >= image_thresh)
+ if (in_scores[j][k] >= image_thresh)
{
new_keeps_j.push_back(k);
}
@@ -293,40 +307,52 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
// Write results
int cur_out_idx = 0;
- for(int j = j_start; j < num_classes; ++j)
+ for (int j = j_start; j < num_classes; ++j)
{
- auto &cur_keep = keeps[j];
- auto cur_out_scores = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
- auto cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
- const int box_column = (cur_start_idx + cur_out_idx) * 4;
-
- for(unsigned int k = 0; k < cur_keep.size(); ++k)
+ auto &cur_keep = keeps[j];
+ auto cur_out_scores =
+ reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ auto cur_out_classes =
+ reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ const int box_column = (cur_start_idx + cur_out_idx) * 4;
+
+ for (unsigned int k = 0; k < cur_keep.size(); ++k)
{
- cur_out_scores[k] = in_scores[j][cur_keep[k]];
- cur_out_classes[k] = static_cast<T>(j);
- auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
- auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
- auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
- auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
- *cur_out_box_row0 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
- *cur_out_box_row1 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
- *cur_out_box_row2 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
- *cur_out_box_row3 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+ cur_out_scores[k] = in_scores[j][cur_keep[k]];
+ cur_out_classes[k] = static_cast<T>(j);
+ auto cur_out_box_row0 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+ auto cur_out_box_row1 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+ auto cur_out_box_row2 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+ auto cur_out_box_row3 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+ *cur_out_box_row0 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+ *cur_out_box_row1 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+ *cur_out_box_row2 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+ *cur_out_box_row3 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
}
cur_out_idx += cur_keep.size();
}
- if(_keeps != nullptr)
+ if (_keeps != nullptr)
{
cur_out_idx = 0;
- for(int j = 0; j < num_classes; ++j)
+ for (int j = 0; j < num_classes; ++j)
{
- for(unsigned int i = 0; i < keeps[j].size(); ++i)
+ for (unsigned int i = 0; i < keeps[j].size(); ++i)
{
- *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+ *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) =
+ static_cast<T>(keeps[j].at(i));
}
- *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
+ *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) =
+ keeps[j].size();
cur_out_idx += keeps[j].size();
}
}
@@ -334,17 +360,25 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
cur_start_idx += total_keep_count;
}
- if(_batch_splits_out != nullptr)
+ if (_batch_splits_out != nullptr)
{
- for(int b = 0; b < batch_size; ++b)
+ for (int b = 0; b < batch_size; ++b)
{
*reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
}
}
}
-void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out,
+ ITensor *keeps,
+ ITensor *keeps_size,
+ const BoxNMSLimitInfo info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
@@ -352,25 +386,28 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_
const unsigned int num_classes = scores_in->info()->dimension(0);
ARM_COMPUTE_UNUSED(num_classes);
- ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
- ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+ ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0),
+ "First dimension of input boxes must be of size 4*num_classes");
+ ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1),
+ "Input scores and input boxes must have the same number of rows");
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0));
- if(keeps != nullptr)
+ if (keeps != nullptr)
{
- ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+ ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr,
+ "keeps_size cannot be nullptr if keeps has to be provided as output");
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32);
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
}
- if(batch_splits_in != nullptr)
+ if (batch_splits_in != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
}
- if(batch_splits_out != nullptr)
+ if (batch_splits_out != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
}
@@ -399,7 +436,7 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
- switch(_scores_in->info()->data_type())
+ switch (_scores_in->info()->data_type())
{
case DataType::F32:
run_nmslimit<float>();
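The Soft-NMS path above decays the scores of overlapping candidates instead of discarding them outright. For the LINEAR method the weight reduces to the following (a hedged restatement of the switch above; the GAUSSIAN case applies an exponential decay instead):

    // ovr = IoU between the kept box and a remaining candidate.
    float soft_nms_linear_weight(float ovr, float nms_threshold)
    {
        return (ovr > nms_threshold) ? (1.f - ovr) : 1.f;
    }
    // Each candidate's score is multiplied by the weight, and the candidate
    // stays pending only while the score remains >= soft_nms_min_score_thres().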
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index c1187ff2b3..1224ec14a7 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -35,15 +35,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
- const float score_threshold, const float iou_threshold)
+Status validate_arguments(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2,
+ "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1,
+ "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1,
+ "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
@@ -55,15 +62,26 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores,
} // namespace
CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel()
- : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0)
+ : _input_bboxes(nullptr),
+ _input_scores(nullptr),
+ _output_indices(nullptr),
+ _max_output_size(0),
+ _score_threshold(0.f),
+ _iou_threshold(0.f),
+ _num_boxes(0)
{
}
-void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices,
- unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes,
+ const ITensor *input_scores,
+ ITensor *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(),
+ max_output_size, score_threshold, iou_threshold));
auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo());
@@ -82,10 +100,15 @@ void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, cons
ICPPKernel::configure(win);
}
-Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices,
- unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
return Status{};
}
@@ -99,10 +122,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
// Auxiliary tensors
std::vector<int> indices_above_thd;
std::vector<float> scores_above_thd;
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i))));
- if(score_i >= _score_threshold)
+ if (score_i >= _score_threshold)
{
scores_above_thd.emplace_back(score_i);
indices_above_thd.emplace_back(i);
@@ -114,12 +137,9 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
std::vector<unsigned int> sorted_indices;
sorted_indices.resize(num_above_thd);
std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0);
- std::sort(std::begin(sorted_indices),
- std::end(sorted_indices),
+ std::sort(std::begin(sorted_indices), std::end(sorted_indices),
[&](unsigned int first, unsigned int second)
- {
- return scores_above_thd[first] > scores_above_thd[second];
- });
+ { return scores_above_thd[first] > scores_above_thd[second]; });
// The number of outputs is the minimum of max_output_size and the number of scores above the threshold
const unsigned int num_output = std::min(_max_output_size, num_above_thd);
@@ -127,19 +147,20 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
std::vector<bool> visited(num_above_thd, false);
// Keep only boxes with small IoU
- for(unsigned int i = 0; i < num_above_thd; ++i)
+ for (unsigned int i = 0; i < num_above_thd; ++i)
{
// Check if the output is full
- if(output_idx >= num_output)
+ if (output_idx >= num_output)
{
break;
}
// If it has not been visited yet, add it to the output and advance the output index
- if(!visited[sorted_indices[i]])
+ if (!visited[sorted_indices[i]])
{
- *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]];
- visited[sorted_indices[i]] = true;
+ *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) =
+ indices_above_thd[sorted_indices[i]];
+ visited[sorted_indices[i]] = true;
++output_idx;
}
else
@@ -148,28 +169,36 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
}
// Once added one element at the output check if the next ones overlap and can be skipped
- for(unsigned int j = i + 1; j < num_above_thd; ++j)
+ for (unsigned int j = i + 1; j < num_above_thd; ++j)
{
- if(!visited[sorted_indices[j]])
+ if (!visited[sorted_indices[j]])
{
// Calculate IoU
const unsigned int i_index = indices_above_thd[sorted_indices[i]];
const unsigned int j_index = indices_above_thd[sorted_indices[j]];
// Box-corner format: xmin, ymin, xmax, ymax
- const auto box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
- const auto box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
- const auto box_i_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
- const auto box_i_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
-
- const auto box_j_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
- const auto box_j_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
- const auto box_j_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
- const auto box_j_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
+ const auto box_i_xmin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
+ const auto box_i_ymin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
+ const auto box_i_xmax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
+ const auto box_i_ymax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
+
+ const auto box_j_xmin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
+ const auto box_j_ymin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
+ const auto box_j_xmax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
+ const auto box_j_ymax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin);
const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin);
float overlap;
- if(area_i <= 0 || area_j <= 0)
+ if (area_i <= 0 || area_j <= 0)
{
overlap = 0.0f;
}
@@ -179,11 +208,12 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
const auto x_min_intersection = std::max<float>(box_i_xmin, box_j_xmin);
const auto y_max_intersection = std::min<float>(box_i_ymax, box_j_ymax);
const auto x_max_intersection = std::min<float>(box_i_xmax, box_j_xmax);
- const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
- overlap = area_intersection / (area_i + area_j - area_intersection);
+ const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) *
+ std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
+ overlap = area_intersection / (area_i + area_j - area_intersection);
}
- if(overlap > _iou_threshold)
+ if (overlap > _iou_threshold)
{
visited[sorted_indices[j]] = true;
}
@@ -192,7 +222,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
}
// The output indices tensor can be larger than the number of valid results;
// pad the remaining entries with -1 to mark them as invalid
- for(; output_idx < _max_output_size; ++output_idx)
+ for (; output_idx < _max_output_size; ++output_idx)
{
*(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1;
}
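The overlap test above is a plain intersection-over-union in box-corner format, with degenerate boxes treated as zero overlap. A self-contained restatement (illustrative only; requires <algorithm>):

    float iou(float axmin, float aymin, float axmax, float aymax,
              float bxmin, float bymin, float bxmax, float bymax)
    {
        const float area_a = (axmax - axmin) * (aymax - aymin);
        const float area_b = (bxmax - bxmin) * (bymax - bymin);
        if (area_a <= 0.f || area_b <= 0.f)
            return 0.f; // degenerate boxes never suppress anything
        const float iw    = std::max(0.f, std::min(axmax, bxmax) - std::max(axmin, bxmin));
        const float ih    = std::max(0.f, std::min(aymax, bymax) - std::max(aymin, bymin));
        const float inter = iw * ih;
        return inter / (area_a + area_b - inter);
    }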
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 054c7bf05a..e68090d82b 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -65,7 +66,7 @@ void CPPPermuteKernel::run_permute(const Window &window)
// Create output window
Window window_out(window);
const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
- for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+ for (size_t d = 0; d <= _perm.num_dimensions(); ++d)
{
window_out.set(d, zero_window);
}
@@ -74,28 +75,32 @@ void CPPPermuteKernel::run_permute(const Window &window)
Iterator in(_input, window);
Iterator out(_output, window_out);
- if(_input->info()->num_dimensions() <= 3)
+ if (_input->info()->num_dimensions() <= 3)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
}
- else if(_input->info()->num_dimensions() >= 4)
+ else if (_input->info()->num_dimensions() >= 4)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] +
+ id[3] * perm_strides[3];
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
}
}
-CPPPermuteKernel::CPPPermuteKernel()
- : _func(), _input(nullptr), _output(nullptr), _perm()
+CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm()
{
}
@@ -113,7 +118,7 @@ void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const Pe
_output = output;
_perm = perm;
- switch(input->info()->element_size())
+ switch (input->info()->element_size())
{
case 1:
_func = &CPPPermuteKernel::run_permute<uint8_t>;
@@ -152,7 +157,7 @@ void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_func != nullptr)
+ if (_func != nullptr)
{
(this->*_func)(window);
}
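run_permute() above computes, for every input coordinate, a flat output offset as a dot product with permuted strides. A hedged restatement, assuming perm_strides[d] holds the output byte stride that input dimension d maps to under perm:

    size_t permuted_offset(const Coordinates &id, const Strides &perm_strides, size_t rank)
    {
        size_t offset = 0;
        for (size_t d = 0; d < rank; ++d)
            offset += id[d] * perm_strides[d]; // coordinate scaled by its permuted output stride
        return offset;
    }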
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
index d2b54e412e..6ffb68e770 100644
--- a/src/core/CPP/kernels/CPPTopKVKernel.cpp
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -34,32 +34,34 @@ namespace arm_compute
{
namespace
{
-template <typename T,
- typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
+template <typename T, typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
inline bool greater_than(T a, T b)
{
const T epsilon = std::numeric_limits<T>::epsilon();
return (a - b > epsilon);
}
-template < typename T,
- typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 >
+template <typename T, typename std::enable_if<!utils::traits::is_floating_point<T>::value, int>::type = 0>
inline bool greater_than(T a, T b)
{
return (a > b);
}
-Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status validate_arguments(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1));
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
@@ -72,22 +74,23 @@ Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *tar
template <typename T>
void CPPTopKVKernel::run_topkv()
{
- for(unsigned int i = 0; i < _batch_size; ++i)
+ for (unsigned int i = 0; i < _batch_size; ++i)
{
- const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i }));
- const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i }));
+ const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{i}));
+ const auto predicted_value =
+ *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{target_class_id, i}));
        // The variable rank counts how many predictions are strictly greater than the one for target_class_id
unsigned int rank = 0;
- for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
+ for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
{
- const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i }));
- if(greater_than(current_prediction, predicted_value))
+ const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{j, i}));
+ if (greater_than(current_prediction, predicted_value))
{
rank++;
}
}
- *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k);
+ *(_output->ptr_to_element(Coordinates{i})) = static_cast<uint8_t>(rank < _k);
}
}
@@ -96,7 +99,10 @@ CPPTopKVKernel::CPPTopKVKernel()
{
}
-void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+void CPPTopKVKernel::configure(const ITensor *predictions,
+ const ITensor *targets,
+ ITensor *output,
+ const unsigned int k)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output);
@@ -115,7 +121,10 @@ void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *target
ICPPKernel::configure(Window()); // Default 1 iteration window
}
-Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKVKernel::validate(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k));
return Status{};
@@ -129,7 +138,7 @@ bool CPPTopKVKernel::is_parallelisable() const
void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(window, info);
- switch(_predictions->info()->data_type())
+ switch (_predictions->info()->data_type())
{
case DataType::F32:
run_topkv<float>();
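
The rank test above reduces to one rule: the target class is "in the top k" when fewer than k predictions score strictly higher than it. A scalar sketch of the same computation (illustrative names, not the library API):

#include <cstdint>
#include <vector>

// Returns 1 when the prediction for target_class is among the k largest in
// the row, using the same count-how-many-score-higher test as the kernel.
uint8_t in_top_k(const std::vector<float> &row, uint32_t target_class, unsigned int k)
{
    const float target_value = row[target_class];
    unsigned int rank = 0; // predictions strictly greater than the target's
    for (std::size_t j = 0; j < row.size() && rank < k; ++j)
    {
        if (row[j] > target_value) // the kernel adds an epsilon margin for floats
        {
            ++rank;
        }
    }
    return static_cast<uint8_t>(rank < k);
}
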
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index 7ef83fb2c4..b1efe32446 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include <cstddef>
@@ -31,8 +32,7 @@
namespace arm_compute
{
-CPPUpsampleKernel::CPPUpsampleKernel()
- : _input(nullptr), _output(nullptr), _info()
+CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info()
{
}
@@ -82,7 +82,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
const size_t element_size = _input->info()->element_size();
// The fill value is normally 0, but for quantized types '0' corresponds to the offset
- switch(_output->info()->data_type())
+ switch (_output->info()->data_type())
{
case DataType::QASYMM8:
{
@@ -102,7 +102,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
// Create window
Window window_out(window);
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width));
window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height));
@@ -117,10 +117,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
Iterator in(_input, window);
Iterator out(_output, window_out);
- execute_window_loop(window, [&](const Coordinates &)
- {
- memcpy(out.ptr(), in.ptr(), element_size);
- },
- in, out);
+ execute_window_loop(
+ window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
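
The fill-value comment above is worth spelling out: with the affine scheme real = scale * (quantized - offset), the code that represents the real value 0 is the offset itself, so quantized upsampling must fill with the offset rather than with literal 0. A trivial sketch:

#include <cstdint>

// The quantized code representing the real value 0 under
// real = scale * (q - offset) is q == offset, for any scale.
uint8_t quantized_zero(float scale, int32_t offset)
{
    (void)scale; // scale * (q - offset) == 0  =>  q == offset
    return static_cast<uint8_t>(offset);
}
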
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 5c8d45c987..679a93f9af 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -36,9 +36,10 @@ Status arm_compute::create_error(ErrorCode error_code, std::string msg)
return Status(error_code, msg);
}
-Status arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
+Status
+arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
{
- std::array<char, 512> out{ 0 };
+ std::array<char, 512> out{0};
snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg);
return Status(error_code, std::string(out.data()));
}
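
The pattern above (snprintf into a fixed std::array buffer) cannot overflow and always NUL-terminates; oversized messages are silently truncated. A standalone sketch of the same idea, not the library API:

#include <array>
#include <cstdio>
#include <string>

// Format "in <func> <file>:<line>: <msg>" into a bounded buffer; snprintf
// truncates rather than overflowing and always writes the terminating NUL.
std::string format_error(const char *func, const char *file, int line, const char *msg)
{
    std::array<char, 512> out{0};
    std::snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg);
    return std::string(out.data());
}
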
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index 292acf8633..2d1a13cb33 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/GPUTarget.h"
+
#include "arm_compute/core/Log.h"
#include <map>
@@ -31,47 +32,47 @@ namespace
{
arm_compute::GPUTarget get_valhall_target(const std::string &version)
{
- if(version.find("G77") != std::string::npos)
+ if (version.find("G77") != std::string::npos)
{
return arm_compute::GPUTarget::G77;
}
- else if(version.find("G57") != std::string::npos)
+ else if (version.find("G57") != std::string::npos)
{
return arm_compute::GPUTarget::G57;
}
- if(version.find("G68") != std::string::npos)
+ if (version.find("G68") != std::string::npos)
{
return arm_compute::GPUTarget::G68;
}
- if(version.find("G78AE") != std::string::npos)
+ if (version.find("G78AE") != std::string::npos)
{
return arm_compute::GPUTarget::G78AE;
}
- if(version.find("G78") != std::string::npos)
+ if (version.find("G78") != std::string::npos)
{
return arm_compute::GPUTarget::G78;
}
- else if(version.find("G710") != std::string::npos)
+ else if (version.find("G710") != std::string::npos)
{
return arm_compute::GPUTarget::G710;
}
- else if(version.find("G610") != std::string::npos)
+ else if (version.find("G610") != std::string::npos)
{
return arm_compute::GPUTarget::G610;
}
- else if(version.find("G510") != std::string::npos)
+ else if (version.find("G510") != std::string::npos)
{
return arm_compute::GPUTarget::G510;
}
- else if(version.find("G310") != std::string::npos)
+ else if (version.find("G310") != std::string::npos)
{
return arm_compute::GPUTarget::G310;
}
- else if(version.find("G715") != std::string::npos)
+ else if (version.find("G715") != std::string::npos)
{
return arm_compute::GPUTarget::G715;
}
- else if(version.find("G615") != std::string::npos)
+ else if (version.find("G615") != std::string::npos)
{
return arm_compute::GPUTarget::G615;
}
@@ -83,39 +84,39 @@ arm_compute::GPUTarget get_valhall_target(const std::string &version)
arm_compute::GPUTarget get_bifrost_target(const std::string &version)
{
- if(version.find("G71") != std::string::npos)
+ if (version.find("G71") != std::string::npos)
{
return arm_compute::GPUTarget::G71;
}
- else if(version.find("G72") != std::string::npos)
+ else if (version.find("G72") != std::string::npos)
{
return arm_compute::GPUTarget::G72;
}
- else if(version.find("G51BIG") != std::string::npos)
+ else if (version.find("G51BIG") != std::string::npos)
{
return arm_compute::GPUTarget::G51BIG;
}
- else if(version.find("G51LIT") != std::string::npos)
+ else if (version.find("G51LIT") != std::string::npos)
{
return arm_compute::GPUTarget::G51LIT;
}
- else if(version.find("G51") != std::string::npos)
+ else if (version.find("G51") != std::string::npos)
{
return arm_compute::GPUTarget::G51;
}
- else if(version.find("G52LIT") != std::string::npos)
+ else if (version.find("G52LIT") != std::string::npos)
{
return arm_compute::GPUTarget::G52LIT;
}
- else if(version.find("G52") != std::string::npos)
+ else if (version.find("G52") != std::string::npos)
{
return arm_compute::GPUTarget::G52;
}
- else if(version.find("G76") != std::string::npos)
+ else if (version.find("G76") != std::string::npos)
{
return arm_compute::GPUTarget::G76;
}
- else if(version.find("G31") != std::string::npos)
+ else if (version.find("G31") != std::string::npos)
{
return arm_compute::GPUTarget::G31;
}
@@ -127,15 +128,15 @@ arm_compute::GPUTarget get_bifrost_target(const std::string &version)
arm_compute::GPUTarget get_midgard_target(const std::string &version)
{
- if(version.find("T600") != std::string::npos)
+ if (version.find("T600") != std::string::npos)
{
return arm_compute::GPUTarget::T600;
}
- else if(version.find("T700") != std::string::npos)
+ else if (version.find("T700") != std::string::npos)
{
return arm_compute::GPUTarget::T700;
}
- else if(version.find("T800") != std::string::npos)
+ else if (version.find("T800") != std::string::npos)
{
return arm_compute::GPUTarget::T800;
}
@@ -150,34 +151,16 @@ namespace arm_compute
{
const std::string &string_from_target(GPUTarget target)
{
- static std::map<GPUTarget, const std::string> gpu_target_map =
- {
- { GPUTarget::MIDGARD, "midgard" },
- { GPUTarget::BIFROST, "bifrost" },
- { GPUTarget::VALHALL, "valhall" },
- { GPUTarget::T600, "t600" },
- { GPUTarget::T700, "t700" },
- { GPUTarget::T800, "t800" },
- { GPUTarget::G71, "g71" },
- { GPUTarget::G72, "g72" },
- { GPUTarget::G51, "g51" },
- { GPUTarget::G51BIG, "g51big" },
- { GPUTarget::G51LIT, "g51lit" },
- { GPUTarget::G31, "g31" },
- { GPUTarget::G76, "g76" },
- { GPUTarget::G52, "g52" },
- { GPUTarget::G52LIT, "g52lit" },
- { GPUTarget::G77, "g77" },
- { GPUTarget::G57, "g57" },
- { GPUTarget::G78, "g78" },
- { GPUTarget::G68, "g68" },
- { GPUTarget::G78AE, "g78ae" },
- { GPUTarget::G710, "g710" },
- { GPUTarget::G610, "g610" },
- { GPUTarget::G510, "g510" },
- { GPUTarget::G310, "g310" },
- { GPUTarget::G715, "g715" },
- { GPUTarget::G615, "g615" },
+ static std::map<GPUTarget, const std::string> gpu_target_map = {
+ {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"},
+ {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"},
+ {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"},
+ {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"},
+ {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"},
+ {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"},
+ {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"},
+ {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"},
+ {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"},
};
return gpu_target_map[target];
@@ -189,7 +172,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
std::smatch name_parts;
const bool found_mali = std::regex_search(device_name, name_parts, mali_regex);
- if(!found_mali)
+ if (!found_mali)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Mali™ GPU. Target is set to default.");
return GPUTarget::MIDGARD;
@@ -203,22 +186,22 @@ GPUTarget get_target_from_name(const std::string &device_name)
// Work-out gpu target
GPUTarget gpu_target;
- if(target == 'G' || is_future_gpu)
+ if (target == 'G' || is_future_gpu)
{
// Check for Valhall or Bifrost
gpu_target = get_valhall_target(version);
- if(gpu_target == GPUTarget::UNKNOWN)
+ if (gpu_target == GPUTarget::UNKNOWN)
{
gpu_target = get_bifrost_target(version);
}
// Default GPUTarget
- if(gpu_target == GPUTarget::UNKNOWN)
+ if (gpu_target == GPUTarget::UNKNOWN)
{
gpu_target = GPUTarget::VALHALL;
}
}
- else if(target == 'T')
+ else if (target == 'T')
{
gpu_target = get_midgard_target(version);
}
@@ -228,7 +211,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
}
// Report in case of unknown target
- if(gpu_target == GPUTarget::UNKNOWN)
+ if (gpu_target == GPUTarget::UNKNOWN)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Mali™ Mali GPU unknown. Target is set to the default one. (BIFROST)");
return GPUTarget::BIFROST;
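
The target detection above works in two stages: a regex pulls the architecture letter and version out of the device name, then ordered substring checks pick the product, testing longer names (G78AE, G51BIG) before their prefixes (G78, G51). A standalone sketch under an assumed pattern (the actual mali_regex is defined elsewhere in this file):

#include <regex>
#include <string>

// Extract the product token (e.g. "G78AE") from an OpenCL device name such
// as "Mali-G78AE r0p0". The regex here is illustrative, not the library's.
std::string parse_mali_product(const std::string &device_name)
{
    static const std::regex mali_regex(R"(Mali-([TG])(\w+))");
    std::smatch parts;
    if (!std::regex_search(device_name, parts, mali_regex))
    {
        return "unknown";
    }
    return parts[1].str() + parts[2].str(); // architecture letter + version
}
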
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 28e7f4c1e5..c801b097b5 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -25,8 +25,11 @@
namespace arm_compute
{
-ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
- InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
+ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info,
+ const TensorShape &dst_shape,
+ InterpolationPolicy interpolate_policy,
+ SamplingPolicy sampling_policy,
+ bool border_undefined)
{
const DataLayout data_layout = src_info.data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -49,9 +52,9 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
auto valid_end_out_y = std::min<int>(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]);
// Handle valid points in case of the bi-linear interpolation
- if(border_undefined)
+ if (border_undefined)
{
- switch(interpolate_policy)
+ switch (interpolate_policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -90,7 +93,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
}
// Setup output valid region
- ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() };
+ ValidRegion valid_region{Coordinates(), dst_shape, dst_shape.num_dimensions()};
valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x));
valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y));
@@ -109,14 +112,12 @@ const std::map<DataLayout, std::vector<DataLayoutDimension>> &get_layout_map()
constexpr DataLayoutDimension D = DataLayoutDimension::DEPTH;
constexpr DataLayoutDimension N = DataLayoutDimension::BATCHES;
- static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map =
- {
- { DataLayout::NDHWC, { C, W, H, D, N } },
- { DataLayout::NCDHW, { W, H, D, C, N } },
- { DataLayout::NHWC, { C, W, H, N } },
- { DataLayout::NCHW, { W, H, C, N } }
- };
+ static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map = {
+ {DataLayout::NDHWC, {C, W, H, D, N}},
+ {DataLayout::NCDHW, {W, H, D, C, N}},
+ {DataLayout::NHWC, {C, W, H, N}},
+ {DataLayout::NCHW, {W, H, C, N}}};
return layout_map;
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
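
calculate_valid_region_scale maps a valid input interval to the output through the scale factors, clamping to the destination extent; the visible hunk shows the end being rounded up with std::ceil. One plausible scalar sketch of that interval mapping, assuming the start is rounded down (illustrative, not the library API):

#include <algorithm>
#include <cmath>

struct Span
{
    int start, end;
};

// Map a valid input span [start_in, end_in) through a scale factor and clamp
// to the destination extent; start rounds down here by assumption.
Span scale_valid_span(int start_in, int end_in, float scale, int dst_extent)
{
    const int start_out = std::max(0, static_cast<int>(std::floor(start_in * scale)));
    const int end_out   = std::min(static_cast<int>(std::ceil(end_in * scale)), dst_extent);
    return {start_out, end_out};
}
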
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 832801255f..923c5f8a85 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -29,14 +29,18 @@
using namespace arm_compute;
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window,
+ const ValidRegion &input_valid_region) const
{
return compute_valid_region(window, input_valid_region, false, BorderSize(0));
}
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -45,7 +49,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
Coordinates old_anchor(anchor);
TensorShape &shape = input_valid_region.shape;
- if(!border_undefined)
+ if (!border_undefined)
{
border_size = BorderSize(0);
}
@@ -56,7 +60,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
// Additionally the valid region is shifted by the offset that is used by
// the kernel to write back output values.
anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
}
@@ -69,15 +73,19 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
// old size is first converted into end points to compared against the
// execution window. Afterwards the new end points are converted back into
// a size of the region.
- shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
- if(_info->num_dimensions() > 1)
+ shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right,
+ (window.x().end() - window.x().step()) * _scale_x + _width) -
+ anchor[0]);
+ if (_info->num_dimensions() > 1)
{
- shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom,
+ (window.y().end() - window.y().step()) * _scale_y + _height) -
+ anchor[1]);
}
// For higher dimensions use the intersection of the window size and the
// valid region of the input
- for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ for (size_t d = 2; d < _info->num_dimensions(); ++d)
{
anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -86,9 +94,12 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
return input_valid_region;
}
-void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size)
+void AccessWindowRectangle::set_valid_region(const Window &window,
+ const ValidRegion &input_valid_region,
+ bool border_undefined,
+ const BorderSize &border_size)
{
- if(_info != nullptr)
+ if (_info != nullptr)
{
_info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size));
}
@@ -97,17 +108,16 @@ void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRe
bool AccessWindowRectangle::update_window_if_needed(Window &window) const
{
// Only update the window size if we can't use padding
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
- PaddingSize needed = get_needed_padding(window);
+ PaddingSize needed = get_needed_padding(window);
PaddingSize available = _info->padding();
- if(needed.top <= available.top && needed.right <= available.right
- && needed.bottom <= available.bottom
- && needed.left <= available.left)
+ if (needed.top <= available.top && needed.right <= available.right && needed.bottom <= available.bottom &&
+ needed.left <= available.left)
{
return false;
}
@@ -124,12 +134,12 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
// Adjust window start for Y dimension
- if(min_y < 0)
+ if (min_y < 0)
{
// Calculate rows available above the tensor
const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
- if(min_y < front_pad_y_available)
+ if (min_y < front_pad_y_available)
{
// Not enough padding available, need to shrink the window
int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
@@ -144,18 +154,19 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
}
// Adjust window end for Y dimension
- if(max_y > static_cast<int>(shape[1]))
+ if (max_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
// Calculate rows available below the tensor
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
- if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
{
// Not enough padding available, need to shrink the window
- int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
- end = std::max<int>(window.y().start(), end / _scale_y);
+ int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) +
+ window.y().step() * _scale_y - _y - _height;
+ end = std::max<int>(window.y().start(), end / _scale_y);
window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
window_modified = true;
@@ -170,11 +181,14 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
// Adjust window start for X dimension
- if(min_x < 0)
+ if (min_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+ stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(min_x < front_pad_x_available)
+ if (min_x < front_pad_x_available)
{
// Not enough padding available, need to shrink the window
int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
@@ -189,15 +203,16 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
}
// Adjust window end for X dimension
- if(max_x > static_cast<int>(shape[0]))
+ if (max_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
- if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
{
// Not enough padding available, need to shrink the window
- int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
- end = std::max<int>(window.x().start(), end / _scale_x);
+ int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) +
+ window.x().step() * _scale_x - _x - _width;
+ end = std::max<int>(window.x().start(), end / _scale_x);
window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
window_modified = true;
@@ -212,15 +227,15 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
bool AccessWindowRectangle::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
// Update strides in tensor info
- return _info->extend_padding( get_needed_padding(window));
+ return _info->extend_padding(get_needed_padding(window));
}
-PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const
+PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const
{
ARM_COMPUTE_ERROR_ON(_scale_x == 0);
ARM_COMPUTE_ERROR_ON(_scale_y == 0);
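
The needed-versus-available comparison in update_window_if_needed reduces to an element-wise check over the four borders; the window only changes when some side needs more padding than the tensor already has. A sketch with an illustrative struct:

// Mirrors the four-border comparison above: the window is left untouched
// whenever the available padding covers the needed padding on every side.
struct Padding
{
    int top, right, bottom, left;
};

bool padding_is_sufficient(const Padding &needed, const Padding &available)
{
    return needed.top <= available.top && needed.right <= available.right &&
           needed.bottom <= available.bottom && needed.left <= available.left;
}
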
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
index 31f1ec7a3f..fb7e095091 100644
--- a/src/core/IKernel.cpp
+++ b/src/core/IKernel.cpp
@@ -30,8 +30,7 @@ const Window &IKernel::window() const
return _window;
}
-IKernel::IKernel()
- : _window()
+IKernel::IKernel() : _window()
{
// Create an empty window to make sure the children classes set the window values themselves
_window.set(Window::DimX, Window::Dimension(0, 0, 1));
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 2f4354cc6f..4dc8ea959b 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -35,7 +35,7 @@ namespace arm_compute
{
void ITensor::copy_from(const ITensor &src)
{
- if(&src == this)
+ if (&src == this)
{
return;
}
@@ -47,7 +47,7 @@ void ITensor::copy_from(const ITensor &src)
ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size());
- for(size_t d = 0; d < src_info->num_dimensions(); d++)
+ for (size_t d = 0; d < src_info->num_dimensions(); d++)
{
ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d));
}
@@ -66,11 +66,7 @@ void ITensor::copy_from(const ITensor &src)
const size_t line_size = src_info->element_size() * src_info->dimension(0);
execute_window_loop(
- win_src, [&](const Coordinates &)
- {
- memcpy(dst_it.ptr(), src_it.ptr(), line_size);
- },
- src_it, dst_it);
+ win_src, [&](const Coordinates &) { memcpy(dst_it.ptr(), src_it.ptr(), line_size); }, src_it, dst_it);
}
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
@@ -87,10 +83,10 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
stream_status.copyfmt(s);
// Set precision
- if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
+ if (is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
{
int precision = io_fmt.precision;
- if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
+ if (io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
{
precision = std::numeric_limits<float>().max_digits10;
}
@@ -101,7 +97,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
size_t print_width = 0;
size_t print_height = 0;
int start_offset = 0;
- switch(io_fmt.print_region)
+ switch (io_fmt.print_region)
{
case IOFormatInfo::PrintRegion::NoPadding:
print_width = this->info()->dimension(0);
@@ -111,13 +107,14 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
case IOFormatInfo::PrintRegion::ValidRegion:
print_width = this->info()->valid_region().shape.x();
print_height = this->info()->valid_region().shape.y();
- start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(),
- this->info()->valid_region().anchor.y()));
+ start_offset = this->info()->offset_element_in_bytes(
+ Coordinates(this->info()->valid_region().anchor.x(), this->info()->valid_region().anchor.y()));
break;
case IOFormatInfo::PrintRegion::Full:
print_width = padding.left + this->info()->dimension(0) + padding.right;
print_height = padding.top + this->info()->dimension(1) + padding.bottom;
- start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0];
+ start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] -
+ padding.left * strides[0];
break;
default:
break;
@@ -129,16 +126,17 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
const uint8_t *ptr = this->buffer() + start_offset;
// Start printing
- for(size_t i = 0; i < slices2D; ++i)
+ for (size_t i = 0; i < slices2D; ++i)
{
// Find max_width of elements in slice to align columns
int max_element_width = 0;
- if(io_fmt.align_columns)
+ if (io_fmt.align_columns)
{
size_t offset = i * strides[2];
- for(size_t h = 0; h < print_height; ++h)
+ for (size_t h = 0; h < print_height; ++h)
{
- max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
+ max_element_width = std::max<int>(
+ max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
offset += strides[1];
}
}
@@ -146,7 +144,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
// Print slice
{
size_t offset = i * strides[2];
- for(size_t h = 0; h < print_height; ++h)
+ for (size_t h = 0; h < print_height; ++h)
{
print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim);
offset += strides[1];
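
copy_from above copies one contiguous line of dim0 elements per loop iteration, so any padding between rows (captured by the strides) is skipped rather than copied. A standalone sketch of that row-wise pattern:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy `rows` lines of `line_size_bytes` each; row strides may exceed the
// line size when the tensors carry padding, which this pattern skips.
void copy_rows(const uint8_t *src, std::size_t src_row_stride,
               uint8_t *dst, std::size_t dst_row_stride,
               std::size_t rows, std::size_t line_size_bytes)
{
    for (std::size_t y = 0; y < rows; ++y)
    {
        std::memcpy(dst + y * dst_row_stride, src + y * src_row_stride, line_size_bytes);
    }
}
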
diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp
index 90f9a45039..0f8b0824f8 100644
--- a/src/core/ITensorPack.cpp
+++ b/src/core/ITensorPack.cpp
@@ -27,10 +27,9 @@
namespace arm_compute
{
-ITensorPack::ITensorPack(std::initializer_list<PackElement> l)
- : _pack()
+ITensorPack::ITensorPack(std::initializer_list<PackElement> l) : _pack()
{
- for(auto &e : l)
+ for (auto &e : l)
{
_pack[e.id] = e;
}
@@ -54,7 +53,7 @@ void ITensorPack::add_const_tensor(int id, const ITensor *tensor)
const ITensor *ITensorPack::get_const_tensor(int id) const
{
auto it = _pack.find(id);
- if(it != _pack.end())
+ if (it != _pack.end())
{
return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
}
@@ -81,4 +80,4 @@ bool ITensorPack::empty() const
{
return _pack.empty();
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index e6d0e532c8..5f4d08d0f6 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -26,6 +26,7 @@
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -90,7 +91,7 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -130,18 +131,13 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to U8
uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = vmaxq_u8(out_u8, min_u8);
out_u8 = vminq_u8(out_u8, max_u8);
@@ -170,7 +166,7 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
int8x16_t max_s8,
bool is_bounded_relu)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -204,18 +200,13 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -247,8 +238,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
const static int32x4_t one_s32 = vdupq_n_s32(1);
// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- int32x4x4_t res_shift_gt0 =
- {
+ int32x4x4_t res_shift_gt0 = {
vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
@@ -260,8 +250,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
- int32x4x4_t res_shift_lt0 =
- {
+ int32x4x4_t res_shift_lt0 = {
vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
@@ -273,8 +262,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
// Select result depending on shift value
- const uint32x4x4_t mask_lt0 =
- {
+ const uint32x4x4_t mask_lt0 = {
#ifdef __aarch64__
vcltzq_s32(result_shift.val[0]),
vcltzq_s32(result_shift.val[1]),
@@ -300,18 +288,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -332,15 +315,20 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
*
* @return Quantized value
*/
-inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
+inline uint8_t finalize_quantization(int32_t in_value,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32_t result_offset_after_shift_s32,
+ uint8_t min_u8,
+ uint8_t max_u8,
+ bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
- if(result_shift < 0)
+ if (result_shift < 0)
{
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+ in_value = vgetq_lane_s32(
+ vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
}
else
{
@@ -355,7 +343,7 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
// Bound the result
uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
}
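
The scalar finalize_quantization above performs the usual integer requantization: a Q31 fixed-point multiply (vqrdmulhq), a rounding right shift, the output offset, then a clamp to the U8 range. A standalone sketch of the positive-shift path, assuming a simple round-half-up shift (Arm's rounding-divide rules differ in corner cases for negative values):

#include <algorithm>
#include <cstdint>

// Scalar mirror of vqrdmulhq_n_s32: saturating rounding doubling multiply
// returning the high half, i.e. round((2*a*b) / 2^32).
int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX; // the only saturating case
    const int64_t p = 2 * static_cast<int64_t>(a) * b;
    return static_cast<int32_t>((p + (INT64_C(1) << 31)) >> 32);
}

uint8_t requantize(int32_t acc, int32_t multiplier, int32_t shift, int32_t offset)
{
    int32_t v = sqrdmulh(acc, multiplier);                   // Q31 fixed-point scale
    v = (v + (shift > 0 ? (1 << (shift - 1)) : 0)) >> shift; // rounding right shift (simplified)
    v += offset;                                             // re-centre on the output offset
    return static_cast<uint8_t>(std::clamp(v, 0, 255));      // saturate to U8
}
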
@@ -375,15 +363,20 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
*
* @return Quantized value
*/
-inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
+inline int8_t finalize_quantization(int32_t in_value,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32_t result_offset_after_shift_s32,
+ int8_t min_s8,
+ int8_t max_s8,
+ bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
- if(result_shift < 0)
+ if (result_shift < 0)
{
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+ in_value = vgetq_lane_s32(
+ vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
}
else
{
@@ -399,7 +392,7 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult
// Bound the result
int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
}
@@ -416,17 +409,16 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult
*/
inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -439,17 +431,14 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI
*/
inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -462,19 +451,24 @@ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationIn
*/
inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -487,19 +481,16 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -513,17 +504,22 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationI
*/
inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -537,17 +533,14 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -560,15 +553,12 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offse
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
{
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
- }
- };
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
+ }};
return vdequantized_input;
}
@@ -581,16 +571,13 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- }
- };
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+ }};
return vdequantized_input;
}
@@ -607,18 +594,15 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf
const int offset = qi.offset;
const float32x4_t voffset = vdupq_n_f32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
- }
- };
+ }};
return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
@@ -635,18 +619,15 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat
const int offset = qi.offset;
const float32x4_t voffset = vdupq_n_f32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
- }
- };
+ }};
return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
@@ -654,22 +635,19 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3
{
const int32x4_t voffset = vdupq_n_s32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#else //__aarch64__
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#endif //__aarch64__
- }
- };
+ }};
return rf;
}
@@ -715,7 +693,7 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQua
auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
- return { pa, pb };
+ return {pa, pb};
}
} // namespace arm_compute
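
All the vdequantize variants above evaluate the same affine formula lane by lane, and vquantize is its inverse. Scalar equivalents (sketch, not the library API; the aarch64 vector path rounds to nearest even, which lrintf matches in the default rounding mode):

#include <algorithm>
#include <cmath>
#include <cstdint>

// dequantize: real = scale * (q - offset)
float dequantize_u8(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(q) - offset);
}

// quantize: q = round(real / scale) + offset, saturated to the U8 range
uint8_t quantize_u8(float real, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lrintf(real / scale)) + offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}
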
diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl
index ca2aea1e18..fd62fd4654 100644
--- a/src/core/NEON/NEAsymm.inl
+++ b/src/core/NEON/NEAsymm.inl
@@ -51,14 +51,14 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
// Convert float32 vectors to uint32 vectors
#if __aarch64__
- if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+ if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
{
A_u32x4 = vcvtnq_u32_f32(A_f32x4);
B_u32x4 = vcvtnq_u32_f32(B_f32x4);
C_u32x4 = vcvtnq_u32_f32(C_f32x4);
D_u32x4 = vcvtnq_u32_f32(D_f32x4);
}
- else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+ else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
{
A_u32x4 = vcvtaq_u32_f32(A_f32x4);
B_u32x4 = vcvtaq_u32_f32(B_f32x4);
@@ -86,7 +86,7 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
}
-template <RoundingPolicy round_policy>
+template <RoundingPolicy round_policy>
inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
{
// Convert uint8 vectors to int16 vectors
@@ -110,14 +110,14 @@ inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x
C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
#if __aarch64__
- if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+ if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
{
A_s32x4 = vcvtnq_s32_f32(A_f32x4);
B_s32x4 = vcvtnq_s32_f32(B_f32x4);
C_s32x4 = vcvtnq_s32_f32(C_f32x4);
D_s32x4 = vcvtnq_s32_f32(D_f32x4);
}
- else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+ else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
{
A_s32x4 = vcvtaq_s32_f32(A_f32x4);
B_s32x4 = vcvtaq_s32_f32(B_f32x4);
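
The two rounding policies selected above differ only in how ties are broken; in scalar terms (assuming the default rounding mode for nearbyint):

#include <cmath>

// vcvtnq_* -> round to nearest, ties to even        (std::nearbyint, default mode)
// vcvtaq_* -> round to nearest, ties away from zero (std::round)
float round_ties_even(float x) { return std::nearbyintf(x); } // 2.5f -> 2.0f
float round_ties_away(float x) { return std::roundf(x); }     // 2.5f -> 3.0f
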
diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl
index 8bff9c4a8e..fb403b6d26 100644
--- a/src/core/NEON/NEFixedPoint.inl
+++ b/src/core/NEON/NEFixedPoint.inl
@@ -30,13 +30,7 @@ namespace arm_compute
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
- float32x4x2_t res =
- {
- {
- vmaxq_f32(a.val[0], b.val[0]),
- vmaxq_f32(a.val[1], b.val[1])
- }
- };
+ float32x4x2_t res = {{vmaxq_f32(a.val[0], b.val[0]), vmaxq_f32(a.val[1], b.val[1])}};
return res;
}
#endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl
index 1cbe669373..f875917988 100644
--- a/src/core/NEON/NEMath.inl
+++ b/src/core/NEON/NEMath.inl
@@ -29,19 +29,16 @@
namespace arm_compute
{
/** Logarithm polynomial coefficients */
-const std::array<float32x4_t, 8> log_tab =
-{
- {
- vdupq_n_f32(-2.29561495781f),
- vdupq_n_f32(-2.47071170807f),
- vdupq_n_f32(-5.68692588806f),
- vdupq_n_f32(-0.165253549814f),
- vdupq_n_f32(5.17591238022f),
- vdupq_n_f32(0.844007015228f),
- vdupq_n_f32(4.58445882797f),
- vdupq_n_f32(0.0141278216615f),
- }
-};
+const std::array<float32x4_t, 8> log_tab = {{
+ vdupq_n_f32(-2.29561495781f),
+ vdupq_n_f32(-2.47071170807f),
+ vdupq_n_f32(-5.68692588806f),
+ vdupq_n_f32(-0.165253549814f),
+ vdupq_n_f32(5.17591238022f),
+ vdupq_n_f32(0.844007015228f),
+ vdupq_n_f32(4.58445882797f),
+ vdupq_n_f32(0.0141278216615f),
+}};
/** Sin polynomial coefficients */
constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
@@ -54,7 +51,7 @@ inline float32x4_t prefer_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
#if __ARM_FEATURE_FMA
return vfmaq_f32(a, b, c);
-#else // __ARM_FEATURE_FMA
+#else // __ARM_FEATURE_FMA
return vmlaq_f32(a, b, c);
#endif // __ARM_FEATURE_FMA
}
@@ -73,13 +70,14 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val)
{
#ifdef __aarch64__
return vrndnq_f32(val);
-#else // __aarch64__
+#else // __aarch64__
static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f);
static const int32x4_t CONST_1_INT = vdupq_n_s32(1);
const float32x4_t floor_val = vfloorq_f32(val);
const float32x4_t diff = vsubq_f32(val, floor_val);
- const float32x4_t fp32_upper_limit = vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U
+ const float32x4_t fp32_upper_limit =
+ vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U
/*
     * 1. Select the floor value when (diff < 0.5 || (diff == 0.5 && floor_val % 2 == 0)).
@@ -95,12 +93,13 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val)
     * Threshold upper limit with format |S|E(8bits)|Fraction(23bits)| = (23 + 127) << 23 (assuming positive sign): 127 is added because the exponent field is biased by 127, so a field value of 127 encodes the exponent zero.
*/
- float32x4_t rounded_val = vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT),
- vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT),
- vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))),
- floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+ float32x4_t rounded_val = vbslq_f32(
+ vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT),
+ vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT),
+ vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+ floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
- float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val);
+ float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val);
return result;
#endif // __aarch64__
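
The non-aarch64 branch above implements round-to-nearest-even by hand: take the floor, compare the fraction against 0.5, break ties toward the even floor, and pass through magnitudes at or above 2^23, which are already integral in fp32. A scalar sketch of the same selection:

#include <cmath>
#include <cstdint>

float round_rte(float x)
{
    if (std::fabs(x) >= 8388608.0f) // 2^23: no fractional bits remain
        return x;
    const float fl   = std::floor(x);
    const float diff = x - fl;
    const bool floor_is_even = (static_cast<int64_t>(fl) % 2) == 0;
    if (diff < 0.5f || (diff == 0.5f && floor_is_even))
        return fl;
    return fl + 1.0f;
}
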
@@ -118,8 +117,8 @@ inline float32x2_t vinvsqrt_f32(float32x2_t x)
inline float32x4_t vinvsqrtq_f32(float32x4_t x)
{
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
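
The two identical lines in vinvsqrtq_f32 are Newton-Raphson refinements: vrsqrtsq_f32 computes (3 - a*b)/2, so each line applies y' = y * (3 - x*y*y) / 2 to the vrsqrteq_f32 estimate. A scalar sketch (the initial estimate here is a perturbed stand-in for the hardware one):

#include <cmath>

float inv_sqrt_newton(float x)
{
    float y = (1.0f / std::sqrt(x)) * (1.0f + 1e-3f); // stand-in for vrsqrteq_f32
    for (int i = 0; i < 2; ++i)
    {
        y = y * (3.0f - x * y * y) * 0.5f; // one Newton-Raphson step
    }
    return y;
}
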
@@ -152,8 +151,7 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t
return res;
}
-static const uint32_t exp_f32_coeff[] =
-{
+static const uint32_t exp_f32_coeff[] = {
0x3f7ffff6, // x^1: 0x1.ffffecp-1f
0x3efffedb, // x^2: 0x1.fffdb6p-2f
0x3e2aaf33, // x^3: 0x1.555e66p-3f
@@ -169,10 +167,12 @@ inline float32x4_t vexpq_f32(float32x4_t x)
const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[3]));
const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[4]));
- const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
- const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
- const auto neg_ln2_hi = vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
- const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+ const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+ const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+ const auto neg_ln2_hi =
+ vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
+ const auto neg_ln2_lo =
+ vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
const auto inf = vdupq_n_f32(std::numeric_limits<float>::infinity());
const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5)
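
The range reduction used by this kernel can be written in scalar form as follows (a sketch under the same constants; std::nearbyint and ldexpf stand in for the vector bit manipulation):

    // e^x = 2^n * e^r, with n = round(x / ln(2)) and r = x - n * ln(2).
    // n * ln(2) is subtracted in two steps (hi and lo parts of -ln(2)) so
    // that r keeps more significant bits than one FP32 subtraction allows.
    inline float exp_range_reduce(float x, int &n)
    {
        const float inv_ln2    = 0x1.715476p+0f;   // 1 / ln(2)
        const float neg_ln2_hi = -0x1.62e400p-1f;  // -ln(2), bits -1 to -19
        const float neg_ln2_lo = -0x1.7f7d1cp-20f; // -ln(2), bits -20 to -42
        n = static_cast<int>(std::nearbyint(x * inv_ln2));
        const float r_hi = x + static_cast<float>(n) * neg_ln2_hi;
        return r_hi + static_cast<float>(n) * neg_ln2_lo; // residual r
    }
    // e^x is then approximately ldexpf(poly(r), n), with poly the 5-term
    // series built from exp_f32_coeff above.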
@@ -224,9 +224,9 @@ inline float32x4_t vexpq_f32(float32x4_t x)
#ifdef __aarch64__
inline float32x4_t verfq_f32(float32x4_t x)
{
- static const float erffdata[4] = { 0.278393f, 0.230389f, 0.000972f, 0.078108f };
+ static const float erffdata[4] = {0.278393f, 0.230389f, 0.000972f, 0.078108f};
static const float32x4_t coeffdata = vld1q_f32(erffdata);
- static const float32x4_t onev{ vdupq_n_f32(1.0f) };
+ static const float32x4_t onev{vdupq_n_f32(1.0f)};
uint32x4_t selector = vcltzq_f32(x);
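
The erffdata coefficients match the classic Abramowitz-Stegun rational approximation 7.1.25 (maximum absolute error around 5e-4). The rest of the kernel body is elided by this hunk; a scalar sketch of what those coefficients compute, assuming <cmath> (an inference from the constants, not necessarily the library's exact code path):

    // erf(x) ~= 1 - 1 / (1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4)^4 for x >= 0,
    // extended to negative x via erf(-x) = -erf(x).
    inline float erf_approx(float x)
    {
        const float a1 = 0.278393f, a2 = 0.230389f, a3 = 0.000972f, a4 = 0.078108f;
        const float ax = std::fabs(x);
        float d = 1.0f + ax * (a1 + ax * (a2 + ax * (a3 + ax * a4)));
        d       = d * d;
        d       = d * d; // d^4
        return std::copysign(1.0f - 1.0f / d, x);
    }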
@@ -287,10 +287,12 @@ inline float32x4_t vtanhq_f32(float32x4_t val)
float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
// x * (1 - x^2/3) if |x| < 5.e-3 or (exp2x - 1) / (exp2x + 1) otherwise
- float32x4_t exp2x = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
- float32x4_t num = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
- float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
- float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
+ float32x4_t exp2x =
+ vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
+ float32x4_t num =
+ vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
+ float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
+ float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
return tanh;
}
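
The four vbslq_f32 selections above encode a single scalar branch; written out as a sketch, with the 5e-3 threshold taken from the comment and the input assumed pre-clamped to the CONST_MIN_TANH/CONST_MAX_TANH range as in the kernel:

    // tanh(x) = (e^{2x} - 1) / (e^{2x} + 1); for tiny |x| the series
    // x * (1 - x^2/3) avoids cancellation in (e^{2x} - 1).
    inline float tanh_approx(float x)
    {
        const float thr = 5e-3f; // assumed value of CONST_THR
        if (std::fabs(x) > thr)
        {
            const float exp2x = std::exp(2.0f * x);
            return (exp2x - 1.0f) / (exp2x + 1.0f);
        }
        return x * (1.0f - x * x / 3.0f);
    }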
@@ -456,30 +458,23 @@ inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in)
inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
{
- out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
- out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
- out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+ out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+ out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+ out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
}
inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
{
- const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in.val[1])));
- const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in.val[3])));
- out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+ const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+ const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+ out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
}
inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
{
- const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
- vqmovn_s32(vcvtq_s32_f32(in.val[1])));
- const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
- vqmovn_s32(vcvtq_s32_f32(in.val[3])));
- out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
+ const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1])));
+ const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3])));
+ out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
}
template <>
@@ -552,8 +547,8 @@ inline float16x4_t vinvsqrt_f16(float16x4_t x)
inline float16x8_t vinvsqrtq_f16(float16x8_t x)
{
float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
@@ -602,8 +597,8 @@ inline float16x4_t vtanh_rational_approx_f16(float16x4_t x16)
inline float16x8_t vtanhq_f16(float16x8_t x)
{
// Split into high/low and use rational approximation on both parts exactly
- const float16x8_t tanh = vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)),
- vtanh_rational_approx_f16(vget_high_f16(x)));
+ const float16x8_t tanh =
+ vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), vtanh_rational_approx_f16(vget_high_f16(x)));
// tanh(x) == sign(x) to F16 precision for |x| >= 4.508, use sign after this
const float16x8_t ONE = vdupq_n_f16(1.0f);
diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h
index e6644577a1..ec246efc8c 100644
--- a/src/core/NEON/NESymm.h
+++ b/src/core/NEON/NESymm.h
@@ -25,7 +25,9 @@
#define ARM_COMPUTE_NESYMM_H
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -49,13 +51,10 @@ using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 1
* @return Quantized values
*/
template <bool is_bounded_relu>
-int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int16x8_t min_s16,
- int16x8_t max_s16)
+int16x8_t finalize_quantization_int16(
+ int32x4x2_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int16x8_t min_s16, int16x8_t max_s16)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
@@ -76,7 +75,7 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
// Convert S32 to S16
int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s16 = vmaxq_s16(out_s16, min_s16);
out_s16 = vminq_s16(out_s16, max_s16);
@@ -98,13 +97,14 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
* @return Quantized values
*/
template <bool is_bounded_relu>
-inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int16_t min_s16, int16_t max_s16)
+inline int16_t finalize_quantization_int16(
+ int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
- const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
- in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
+ const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) *
+ static_cast<int64_t>(result_fixedpoint_multiplier);
+ in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
}
else
{
@@ -117,7 +117,7 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi
// Bound the result
int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
}
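
The negative-shift path above is a gemmlowp-style rounding doubling high multiply written out in 64-bit arithmetic; as a scalar sketch (the positive-shift path, elided by this hunk, applies a rounding right shift after the same high multiply):

    // (a * b + 2^30) >> 31 keeps the doubled high half of the product
    // with round-to-nearest; saturation of the a == b == INT32_MIN
    // corner case is omitted for brevity.
    inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        const int64_t prod = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        return static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);
    }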
@@ -134,14 +134,9 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi
*/
inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
- }
- };
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)}};
return vdequantized_input;
}
@@ -156,18 +151,13 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
{
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x2_t rf =
- {
- {
+ const int32x4x2_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+ vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+ vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
#endif //__aarch64__
- }
- };
+ }};
return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
}
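
The scalar relationship behind this quantize/dequantize pair (a sketch, assuming <cmath> and <algorithm>; note the non-AArch64 vector path truncates via vcvtq_s32_f32 instead of rounding to nearest):

    // QSYMM16 is symmetric: no zero-point offset, only a scale.
    inline int16_t quantize_qsymm16_scalar(float v, float scale)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / scale));
        return static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, q)));
    }

    inline float dequantize_qsymm16_scalar(int16_t q, float scale)
    {
        return static_cast<float>(q) * scale;
    }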
@@ -180,17 +170,14 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
*/
inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
- }
- };
+ const float scale = qi.scale;
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
+ }};
return vdequantized_input;
}
@@ -206,24 +193,20 @@ inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQua
const float scale = qi.scale;
ARM_COMPUTE_ERROR_ON(scale == 0.f);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#endif //__aarch64__
- }
- };
- const qsymm16x8x2_t res =
- {
+ }};
+ const qsymm16x8x2_t res = {
vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
};
diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h
index eea2627c62..a448cde475 100644
--- a/src/core/NEON/SVEAsymm.h
+++ b/src/core/NEON/SVEAsymm.h
@@ -26,6 +26,7 @@
#if defined(ARM_COMPUTE_ENABLE_SVE2)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -70,10 +71,18 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scal
const auto voffset = svdup_n_s32(offset);
const auto vscale = svdup_n_f32(scale);
const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale));
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)),
+ vscale));
return vdequantized_input;
}
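
svmovlb/svmovlt widen the even and odd lanes respectively, so the four output vectors interleave the input rather than splitting it in order; per element, the math above is simply (sketch):

    // Asymmetric dequantization: scale * (q - offset).
    inline float dequantize_qasymm8_scalar(uint8_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }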
@@ -104,10 +113,10 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale
const auto voffset = svdup_n_s32(offset);
const auto vscale = svdup_n_f32(scale);
const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
return vdequantized_input;
}
@@ -135,11 +144,11 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const Unifo
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale)
{
- const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
+ const svfloat32x4_t vdequantized_input =
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
return vdequantized_input;
}
@@ -153,12 +162,12 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svflo
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale)
{
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
return vdequantized_input;
}
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
index 5ada7ae0ff..6d69b330ba 100644
--- a/src/core/NEON/SVEMath.h
+++ b/src/core/NEON/SVEMath.h
@@ -28,6 +28,7 @@
#include "src/core/NEON/wrapper/intrinsics/svcvt.h"
#include "src/core/NEON/wrapper/intrinsics/svdup_n.h"
#include "src/core/NEON/wrapper/intrinsics/svreinterpret.h"
+
#include <arm_sve.h>
#include <array>
@@ -181,9 +182,12 @@ svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b);
* @return The converted integer vector
*/
template <typename int_vec_type>
-int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3);
+int_vec_type convert_float_to_int(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3);
} // namespace arm_compute
#include "src/core/NEON/SVEMath.inl"
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#endif /* ARM_COMPUTE_SVEMATH_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_SVEMATH_H */
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
index 8973d0b273..b30125dcb7 100644
--- a/src/core/NEON/SVEMath.inl
+++ b/src/core/NEON/SVEMath.inl
@@ -32,8 +32,16 @@
namespace arm_compute
{
-inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3,
- svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8)
+inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg,
+ svfloat32_t x,
+ svfloat32_t coeff_1,
+ svfloat32_t coeff_2,
+ svfloat32_t coeff_3,
+ svfloat32_t coeff_4,
+ svfloat32_t coeff_5,
+ svfloat32_t coeff_6,
+ svfloat32_t coeff_7,
+ svfloat32_t coeff_8)
{
const auto A = svmla_f32_z(pg, coeff_1, coeff_5, x);
const auto B = svmla_f32_z(pg, coeff_3, coeff_7, x);
@@ -45,8 +53,16 @@ inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t c
return res;
}
-inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3,
- svfloat16_t coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8)
+inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg,
+ svfloat16_t x,
+ svfloat16_t coeff_1,
+ svfloat16_t coeff_2,
+ svfloat16_t coeff_3,
+ svfloat16_t coeff_4,
+ svfloat16_t coeff_5,
+ svfloat16_t coeff_6,
+ svfloat16_t coeff_7,
+ svfloat16_t coeff_8)
{
const auto A = svmla_f16_z(pg, coeff_1, coeff_5, x);
const auto B = svmla_f16_z(pg, coeff_3, coeff_7, x);
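
The pairwise A/B/C/D grouping is Estrin's scheme: it trades Horner's serial dependency chain for independent FMAs combined with x^2 and x^4. The tail of both function bodies is elided by these hunks; a scalar sketch assuming the usual pairing (an assumption, since only the A and B lines are visible here):

    // poly(x) = (A + B*x^2) + (C + D*x^2) * x^4, with
    //   A = c1 + c5*x, B = c3 + c7*x, C = c2 + c6*x, D = c4 + c8*x,
    // i.e. three dependent FMA levels instead of seven for plain Horner.
    inline float taylor_poly8(float x, const std::array<float, 8> &c)
    {
        const float A  = c[0] + c[4] * x;
        const float B  = c[2] + c[6] * x;
        const float C  = c[1] + c[5] * x;
        const float D  = c[3] + c[7] * x;
        const float x2 = x * x;
        const float x4 = x2 * x2;
        return (A + B * x2) + (C + D * x2) * x4;
    }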
@@ -90,15 +106,17 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x)
const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3]));
const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4]));
- const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
- const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
- const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
- const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+ const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+ const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+ const auto neg_ln2_hi =
+ svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
+ const auto neg_ln2_lo =
+ svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
const auto inf = svdup_n_f32(std::numeric_limits<float>::infinity());
- const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
+ const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
const auto zero = svdup_n_f32(0.f);
- const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
+ const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
// Range reduction:
// e^x = 2^n * e^r
@@ -114,23 +132,23 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x)
// (i.e. n) because the decimal part has been pushed out and lost.
// * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
// in FP32 format. Left shifting z by 23 bits will result in 2^n.
- const auto z = svmla_f32_z(pg, shift, x, inv_ln2);
- const auto n = svsub_f32_z(pg, z, shift);
- const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n
+ const auto z = svmla_f32_z(pg, shift, x, inv_ln2);
+ const auto n = svsub_f32_z(pg, z, shift);
+ const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n
// The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
// This outperforms a longer Taylor series (3-4 extra terms) in both accuracy and performance.
const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi);
- const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
+ const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
// Compute the truncated Taylor series of e^r.
// poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
const auto r2 = svmul_f32_z(pg, r, r);
- const auto p1 = svmul_f32_z(pg, c1, r);
- const auto p23 = svmla_f32_z(pg, c2, c3, r);
- const auto p45 = svmla_f32_z(pg, c4, c5, r);
- const auto p2345 = svmla_f32_z(pg, p23, p45, r2);
+ const auto p1 = svmul_f32_z(pg, c1, r);
+ const auto p23 = svmla_f32_z(pg, c2, c3, r);
+ const auto p45 = svmla_f32_z(pg, c4, c5, r);
+ const auto p2345 = svmla_f32_z(pg, p23, p45, r2);
const auto p12345 = svmla_f32_z(pg, p1, p2345, r2);
auto poly = svmla_f32_z(pg, scale, p12345, scale);
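
The comment block above describes a bit-level trick worth spelling out: adding shift = 2^23 + 127 both rounds x/ln(2) to an integer n (the fraction falls off the end of the FP32 mantissa) and leaves n + 127 in the low mantissa bits, ready to become a biased exponent. A scalar sketch of the 2^n construction, assuming <cstring> and <cstdint>:

    // Build 2^n directly from z = x/ln(2) + shift, with no int conversion.
    inline float make_pow2(float x_over_ln2)
    {
        const float shift = 0x1.0000fep23f; // 2^23 + 127
        const float z     = x_over_ln2 + shift;
        uint32_t bits;
        std::memcpy(&bits, &z, sizeof(bits));
        bits <<= 23; // mantissa low bits (n + 127) become the exponent field
        float scale;
        std::memcpy(&scale, &bits, sizeof(scale));
        return scale; // == 2^n for n = round(x_over_ln2) within FP32 range
    }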
@@ -213,7 +231,8 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x)
auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23)));
// Polynomial Approximation
- auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8);
+ auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6,
+ log_tab_7, log_tab_8);
// Reconstruct
poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2);
@@ -259,7 +278,8 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val)
// Find positive or negative
const auto c_v = svabs_z(pg, wrapper::svcvt_z<int32_t>(pg, svmul_z(pg, val, ipi_v)));
const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0)));
- const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0)));
+ const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))),
+ wrapper::svdup_n(IntType(0)));
auto neg_v = sveor_z(pg, odd_v, sign_v);
@@ -347,7 +367,10 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b)
#if defined(ARM_COMPUTE_ENABLE_SVE2)
template <>
-inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3)
{
svuint8_t out;
const auto all_true_pg = svptrue_b32();
@@ -381,7 +404,10 @@ inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const
}
template <>
-inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3)
{
svint8_t out;
const auto all_true_pg = svptrue_b32();
diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h
index 6808577681..288d45d979 100644
--- a/src/core/NEON/SVESymm.h
+++ b/src/core/NEON/SVESymm.h
@@ -28,6 +28,7 @@
#if defined(ARM_COMPUTE_ENABLE_SVE2)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -42,8 +43,10 @@ namespace arm_compute
*/
inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale)
{
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x2_t vdequantized_input = svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale));
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x2_t vdequantized_input =
+ svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale));
return vdequantized_input;
}
@@ -76,13 +79,13 @@ inline svint16_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x2_t qv, float
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale));
+ const float scale = qi.scale;
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale));
return vdequantized_input;
}
@@ -112,4 +115,4 @@ inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, con
} // namespace arm_compute
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
-#endif // ARM_COMPUTE_NESYMM_H \ No newline at end of file
+#endif // ARM_COMPUTE_NESYMM_H
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 108b199df7..deb89996a9 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -28,18 +28,17 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
-#include "src/core/common/Registrars.h"
-
#include <map>
namespace arm_compute
@@ -52,8 +51,15 @@ struct BatchNormalizationSelectorData
const CPUInfo &ci;
};
using BatchNormalizationSelectorPtr = std::add_pointer<bool(const BatchNormalizationSelectorData &data)>::type;
-using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
- float, ActivationLayerInfo &, const Window &)>::type;
+using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ ActivationLayerInfo &,
+ const Window &)>::type;
struct BatchNormalizationKernel
{
@@ -62,41 +68,32 @@ struct BatchNormalizationKernel
BatchNormalizationKernelPtr ukernel;
};
-static const BatchNormalizationKernel available_kernels[] =
-{
+static const BatchNormalizationKernel available_kernels[] = {
#if defined(ARM_COMPUTE_ENABLE_SVE)
- {
- "sve_fp16_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)
- },
- {
- "sve_fp32_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
- },
+ {"sve_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)},
+ {"sve_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)
- },
+ {"neon_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- {
- "neon_fp32_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
- },
+ {"neon_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
};
const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -104,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec
return nullptr;
}
-Status
-validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() });
+ const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(nullptr != output)
+ if (nullptr != output)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -131,17 +134,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- if(beta != nullptr)
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
return Status{};
}
@@ -169,10 +173,12 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win
// Only compute denominator and constants once per feature map.
int slice = -1;
- const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T mean = static_cast<T>(0);
T var = static_cast<T>(0);
@@ -186,80 +192,83 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win
auto beta_vec = wrapper::vdup_n(beta, ExactTagType{});
auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
- execute_window_loop(win_to_use, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- if(slice != id.z())
+ execute_window_loop(
+ win_to_use,
+ [&](const Coordinates &id)
{
- mean = input_mean[id.z()];
- var = input_var[id.z()];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[id.z()];
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- }
- if(input_beta != nullptr)
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ if (slice != id.z())
{
- beta = input_beta[id.z()];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ mean = input_mean[id.z()];
+ var = input_var[id.z()];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id.z()];
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ }
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id.z()];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ // Calculate denominator
+ denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ denominator = wrapper::vgetlane(denominator_vec, 0);
+ slice = id.z();
}
- // Calculate denominator
- denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- denominator = wrapper::vgetlane(denominator_vec, 0);
- slice = id.z();
- }
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator_vec);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(fused_activation)
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator_vec);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const T numerator = input_ptr[x] - mean;
- const T x_bar = numerator * denominator;
- T res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(fused_activation)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+ const T numerator = input_ptr[x] - mean;
+ const T x_bar = numerator * denominator;
+ T res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *(output_ptr + x) = res;
}
-
- // Store results
- *(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
void NEBatchNormalizationLayerKernel::configure_non_fused()
{
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
+ _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false,
+ detail::dummy<float16_t, 8>>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
@@ -274,23 +283,25 @@ void NEBatchNormalizationLayerKernel::configure_non_fused()
void NEBatchNormalizationLayerKernel::configure_fused()
{
// NCHW Fused Batched Normalization with activation functions : FP32
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
- };
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>}};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// NCHW Fused Batched Normalization with activation functions : FP16
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
- };
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>}};
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
@@ -307,22 +318,32 @@ void NEBatchNormalizationLayerKernel::configure_fused()
}
NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
+ : _func(nullptr),
+ _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _gamma(nullptr),
+ _beta(nullptr),
+ _epsilon(),
+ _act_info()
{
}
-void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
- const ITensor *mean, const ITensor *var,
- const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void NEBatchNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
- mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr,
- (gamma != nullptr) ? gamma->info() : nullptr,
- epsilon, act_info));
+ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
+ (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
_input = input;
_output = input;
@@ -334,16 +355,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
_act_info = act_info;
const bool run_in_place = (output == nullptr) || (output == input);
- if(!run_in_place)
+ if (!run_in_place)
{
_output = output;
}
// Configure activation function to run
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
- if(_act_info.enabled())
+ if (_act_info.enabled())
{
configure_fused();
}
@@ -357,17 +378,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
Window win = calculate_max_window(*input->info(), Steps());
INEKernel::configure(win);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
}
-Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
@@ -382,13 +407,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW);
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
(this->*_func)(window);
}
else
{
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() });
+ const auto *uk =
+ get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()});
uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window);
}
}
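
Behind both the NCHW member function and the NHWC micro-kernels dispatched here, the per-element computation is the standard inference-time batch normalization (sketch, assuming <cmath>):

    // y = gamma * (x - mean) / sqrt(var + epsilon) + beta,
    // optionally followed by the fused RELU/BOUNDED_RELU/LU_BOUNDED_RELU.
    inline float batch_norm_element(float x, float mean, float var,
                                    float gamma, float beta, float epsilon)
    {
        const float denominator = 1.0f / std::sqrt(var + epsilon);
        const float x_bar       = (x - mean) * denominator;
        return beta + x_bar * gamma;
    }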
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 0551ace30c..2e8ff0dc9a 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -68,7 +69,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta = nullptr,
+ const ITensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
*
@@ -85,10 +92,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index 83fb5f6f51..f299bb94a4 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status validate_arguments_static(const ITensorInfo *input,
+ int block_shape_x,
+ int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -65,13 +70,14 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in
const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
- const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
}
@@ -80,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in
} // namespace
NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _block_shape_x(),
+ _block_shape_y(),
+ _crop_info()
{
}
@@ -99,15 +111,18 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b
ICPPKernel::configure(win);
}
-void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
+void NEBatchToSpaceLayerKernel::configure(
+ const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
@@ -121,14 +136,19 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_sh
ICPPKernel::configure(win);
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
@@ -141,7 +161,7 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
@@ -155,31 +175,32 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
-
- const int x = id.x();
- const int y = id.y();
- const int z = id.z();
- // Translate x, y to uncropped version
- const int x_c = x + _crop_info.left;
- const int y_c = y + _crop_info.top;
-
- const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
- const int in_x = x_c / _block_shape_x;
- const int in_y = y_c / _block_shape_y;
- Coordinates input_coords{ in_x, in_y, z, in_batch };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.x();
+ const int y = id.y();
+ const int z = id.z();
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{in_x, in_y, z, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
@@ -188,26 +209,28 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
-
- const int x = id.y();
- const int y = id.z();
-
- // Translate x, y to uncropped version
- const int x_c = x + _crop_info.left;
- const int y_c = y + _crop_info.top;
-
- const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
- const int in_x = x_c / _block_shape_x;
- const int in_y = y_c / _block_shape_y;
- Coordinates input_coords{ 0, in_x, in_y, in_batch };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0));
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.y();
+ const int y = id.z();
+
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{0, in_x, in_y, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords),
+ element_size * _input->info()->dimension(0));
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
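For readers tracing the reformatted NCHW loop above: each cropped output coordinate is first translated back to its uncropped position, then split into a source batch offset and a downscaled spatial position. A standalone sketch of that index arithmetic, with illustrative block, crop, and batch values (nothing here is taken from the library):

```cpp
#include <cstdio>

// Minimal model of the NCHW batch-to-space mapping used in the loop above.
int main()
{
    const int block_x = 2, block_y = 2;    // _block_shape_x / _block_shape_y
    const int crop_left = 1, crop_top = 1; // _crop_info
    const int batch_size = 4;              // batches in the output tensor
    const int batch_id   = 0;              // current output batch

    for (int y = 0; y < 2; ++y)
    {
        for (int x = 0; x < 2; ++x)
        {
            const int x_c = x + crop_left; // translate to uncropped coordinates
            const int y_c = y + crop_top;
            const int in_batch =
                batch_id + ((x_c % block_x) + (y_c % block_y) * block_x) * batch_size;
            const int in_x = x_c / block_x;
            const int in_y = y_c / block_y;
            std::printf("out(x=%d, y=%d) <- in(batch=%d, x=%d, y=%d)\n", x, y, in_batch, in_x, in_y);
        }
    }
    return 0;
}
```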
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
index 5eceee0904..d98ac621b0 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -68,7 +69,11 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{});
+ void configure(const ITensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ITensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -90,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{});
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
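The configure/validate split documented above is the usual two-phase contract: validate() is a static, allocation-free pre-flight check over ITensorInfo descriptors. A hedged usage sketch, assuming the library's TensorInfo constructor and Status::error_code() accessor, with illustrative shape and block values (it builds only inside the library tree, since this kernel header is internal):

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"

using namespace arm_compute;

// An output TensorInfo left empty skips the output-shape checks, mirroring the
// total_size() != 0 guards used throughout these kernels.
bool can_run_batch_to_space()
{
    const TensorInfo input(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32);
    TensorInfo       output;
    const Status     s = NEBatchToSpaceLayerKernel::validate(&input, 2, 2, &output);
    return s.error_code() == ErrorCode::OK;
}
```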
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 677c5cddcc..a59bbd233b 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2,
}
} // namespace
-NEBitwiseAndKernel::NEBitwiseAndKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
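The bitwise_and<uint8_t> helper called from the collapsed lambda above handles 16 bytes per iteration with NEON. A self-contained sketch of the same intrinsic sequence (requires a NEON-capable target; buffer contents are illustrative):

```cpp
#include <arm_neon.h>

#include <cstdint>
#include <cstdio>

// Load 16 bytes from each input, AND them, store 16 bytes of output.
void bitwise_and_u8x16(const uint8_t *in1, const uint8_t *in2, uint8_t *out)
{
    const uint8x16_t a = vld1q_u8(in1);
    const uint8x16_t b = vld1q_u8(in2);
    vst1q_u8(out, vandq_u8(a, b));
}

int main()
{
    uint8_t a[16], b[16], c[16];
    for (int i = 0; i < 16; ++i)
    {
        a[i] = static_cast<uint8_t>(i);
        b[i] = 0x0F;
    }
    bitwise_and_u8x16(a, b, c);
    std::printf("c[10] = %u\n", c[10]); // prints 10, since 10 & 0x0F == 10
    return 0;
}
```

The Not/Or/Xor kernels below follow the identical structure, swapping vandq_u8 for vmvnq_u8, vorrq_u8, or veorq_u8.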
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 19b1af690a..ecd181a7af 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri
}
} // namespace
-NEBitwiseNotKernel::NEBitwiseNotKernel()
- : _input(nullptr), _output(nullptr)
+NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info)
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_not_U8_U8(input.ptr(), output.ptr());
- },
- input, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 08094fbfcf..4c906134aa 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseOrKernel::NEBitwiseOrKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -99,9 +99,7 @@ void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index fc5b38b64f..dbbed2483c 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseXorKernel::NEBitwiseXorKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2,
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
- AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access);
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 69bfd56ce0..cb869838e2 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/boundingboxtransform/list.h"
@@ -45,7 +46,11 @@ struct BoundingBoxTransformSelectorData
};
using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
-using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)>::type;
struct BoundingBoxTransformKernel
{
@@ -54,26 +59,19 @@ struct BoundingBoxTransformKernel
BoundingBoxTransformUKernelPtr ukernel;
};
-static const BoundingBoxTransformKernel available_kernels[] =
-{
- {
- "fp32_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)
- },
+static const BoundingBoxTransformKernel available_kernels[] = {
+ {"fp32_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)
- },
+ {"fp16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "qu16_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)
- },
+ {"qu16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
};
@@ -85,9 +83,9 @@ static const BoundingBoxTransformKernel available_kernels[] =
*/
const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -95,7 +93,10 @@ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformS
return nullptr;
}
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes);
@@ -108,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
- if(boxes->data_type() == DataType::QASYMM16)
+ if (boxes->data_type() == DataType::QASYMM16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8);
const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform();
@@ -120,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(pred_boxes->data_type() == DataType::QASYMM16)
+ if (pred_boxes->data_type() == DataType::QASYMM16)
{
const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f);
@@ -142,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel()
{
}
-void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransformKernel::configure(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
// Configure kernel window
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
// Set instance variables
_boxes = boxes;
@@ -164,7 +171,10 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred
INEKernel::configure(win);
}
-Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
@@ -176,7 +186,7 @@ void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &i
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() });
+ const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
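The available_kernels table and get_implementation() loop reformatted above are the library's micro-kernel dispatch idiom: a static array of {name, selector predicate, function pointer} entries scanned in order, first match wins, nullptr if nothing matches. A cut-down model of the pattern (types, names, and entries are illustrative):

```cpp
#include <cstdio>

enum class DataType { F32, F16 };

struct SelectorData { DataType dt; };

struct Kernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

void run_fp32() { std::puts("fp32 path"); }
void run_fp16() { std::puts("fp16 path"); }

// First matching predicate wins, so order entries from most to least preferred.
static const Kernel available_kernels[] = {
    {"fp32_neon", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_fp32},
    {"fp16_neon", [](const SelectorData &d) { return d.dt == DataType::F16; }, run_fp16},
};

const Kernel *get_implementation(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr; // the caller must treat this as "unsupported configuration"
}

int main()
{
    if (const Kernel *uk = get_implementation(SelectorData{DataType::F32}))
    {
        uk->ukernel();
    }
    return 0;
}
```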
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index def827836c..3915994feb 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -63,7 +63,8 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+ void
+ configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -77,7 +78,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 64da1f2262..3b53b7055f 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -72,20 +77,22 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu
Iterator in(input, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.x();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimX, channel_id * num_groups + group_id);
- std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
- },
- in);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ // Shuffle channel
+ const unsigned int curr_channel = id.x();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimX, channel_id * num_groups + group_id);
+ std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
+ },
+ in);
}
void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
{
@@ -107,34 +114,35 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu
Iterator in(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.z();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
- const uint8_t *input_ptr = in.ptr();
- uint8_t *output_ptr = output->ptr_to_element(out_coords);
-
- // Copy plane
- for(unsigned int y = 0; y < height; ++y)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- std::copy_n(input_ptr, row_size, output_ptr);
- input_ptr += input_stride_y;
- output_ptr += output_stride_y;
- }
- },
- in);
+ // Shuffle channel
+ const unsigned int curr_channel = id.z();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *output_ptr = output->ptr_to_element(out_coords);
+
+ // Copy plane
+ for (unsigned int y = 0; y < height; ++y)
+ {
+ std::copy_n(input_ptr, row_size, output_ptr);
+ input_ptr += input_stride_y;
+ output_ptr += output_stride_y;
+ }
+ },
+ in);
}
} // namespace
-NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr), _num_groups()
+NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups()
{
}
@@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu
INEKernel::configure(win);
}
-Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
return Status{};
@@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_input->info()->data_layout())
+ switch (_input->info()->data_layout())
{
case DataLayout::NHWC:
channel_shuffle_nhwc(_input, _output, _num_groups, window);
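Both shuffle loops above derive the same permutation: view the C channels as a [num_groups, K] grid and transpose it to [K, num_groups]. The kernel reaches group_id through a precomputed reciprocal (rK), but the mapping reduces to integer division and modulo, as in this sketch with illustrative channel counts:

```cpp
#include <cstdio>

// Channel shuffle: channel c in group-major order moves to channel-major order.
// Assumes channels is a multiple of num_groups, as the validator enforces.
int shuffled_channel(int c, int num_groups, int channels)
{
    const int K          = channels / num_groups; // channels per group
    const int group_id   = c / K;
    const int channel_id = c % K;
    return channel_id * num_groups + group_id;
}

int main()
{
    const int channels = 6, groups = 2;
    for (int c = 0; c < channels; ++c)
    {
        std::printf("%d -> %d\n", c, shuffled_channel(c, groups, channels));
    }
    return 0; // prints 0->0, 1->2, 2->4, 3->1, 4->3, 5->5
}
```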
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
index 1976302036..bc6652fd30 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.h
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
#define ARM_COMPUTE_NECOL2IMKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/Size2D.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
class ITensor;
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 94c455305c..60271fbc74 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -26,14 +26,15 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/helpers/bit_ops.h"
#include "src/cpu/kernels/crop/list.h"
@@ -47,7 +48,8 @@ struct CropSelectorData
};
using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
-using CropUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
+using CropUKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
struct CropUKernel
{
@@ -56,48 +58,23 @@ struct CropUKernel
CropUKernelPtr ukernel;
};
-static const CropUKernel available_kernels[] =
-{
- {
- "fp16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)
- },
- {
- "f32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)
- },
- {
- "u8_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)
- },
- {
- "u16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)
- },
- {
- "u32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)
- },
- {
- "s8_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)
- },
- {
- "s16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)
- },
- {
- "s32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)
- },
+static const CropUKernel available_kernels[] = {
+ {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)},
+ {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)},
+ {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)},
+ {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)},
+ {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)},
+ {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)},
+ {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)},
+ {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)},
};
/** Micro-kernel selector
@@ -108,9 +85,9 @@ static const CropUKernel available_kernels[] =
*/
const CropUKernel *get_implementation(const CropSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -119,26 +96,40 @@ const CropUKernel *get_implementation(const CropSelectorData &data)
return nullptr;
}
-inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+inline void out_of_bounds_crop_window(const ITensor *output,
+ float *output_ptr,
+ float extrapolation_value,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit)
{
- auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
- float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x)
+ auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
+ int32_t x = 0;
+ int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for (; x <= limit - window_step_x; x += window_step_x)
{
wrapper::vstore(output_start_ptr + x, in);
}
- for(; x < limit; ++x)
+ for (; x < limit; ++x)
{
*(output_start_ptr + x) = extrapolation_value;
}
}
-inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value,
- const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
- bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped)
+inline void execute_window(const ITensor *input,
+ const ITensor *output,
+ Coordinates input_offset,
+ float extrapolation_value,
+ const std::array<uint32_t, 2> &rows_out_of_bounds,
+ const std::array<uint32_t, 2> &cols_out_of_bounds,
+ NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
+ bool is_height_flipped,
+ bool has_cols_in_bounds,
+ bool has_cols_out_of_bounds_before,
+ bool has_cols_out_of_bounds_after,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
// Output is always float.
const int window_step_x = 16 / sizeof(float);
@@ -159,45 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina
// |------------------------------|
// Fill all output rows that have no elements that are within the input bounds with the extrapolation value.
// First for the rows before the in bounds rows.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[0] * output->info()->dimension(1));
output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
// Iterate through each row that has any elements within the input bounds.
- for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
- ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
+ for (uint32_t row = rows_out_of_bounds[0];
+ static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
+ ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
{
// Fill all elements in the row that are out of bounds with the extrapolation value.
// First for the elements before the in bounds elements.
- if(has_cols_out_of_bounds_before)
+ if (has_cols_out_of_bounds_before)
{
out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
}
// Copy all elements within the input bounds from the input tensor.
- if(has_cols_in_bounds)
+ if (has_cols_in_bounds)
{
(*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0],
- output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped);
+ output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel,
+ is_width_flipped);
}
// Fill all elements after the in bounds elements with the extrapolation value.
- if(has_cols_out_of_bounds_after)
+ if (has_cols_out_of_bounds_after)
{
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x,
+ output->info()->dimension(1) - cols_out_of_bounds[1],
+ output->info()->dimension(1));
}
output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
}
// Fill all rows after the in bounds elements with the extrapolation value.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[1] * output->info()->dimension(1));
}
} // namespace
NECropKernel::NECropKernel()
- : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds()
+ : _input(nullptr),
+ _crop_boxes(nullptr),
+ _box_ind(nullptr),
+ _output(nullptr),
+ _start(),
+ _end(),
+ _crop_box_ind(0),
+ _extrapolation_value(0),
+ _rows_out_of_bounds(),
+ _cols_out_of_bounds()
{
}
-void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value)
+void NECropKernel::configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(),
+ crop_box_ind, extrapolation_value));
_input = input;
_crop_boxes = crop_boxes;
@@ -207,21 +219,27 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co
_extrapolation_value = extrapolation_value;
}
-Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
+Status NECropKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_UNUSED(extrapolation_value);
- const auto *uk = get_implementation(CropSelectorData{ input->data_type() });
+ const auto *uk = get_implementation(CropSelectorData{input->data_type()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16,
+ DataType::F16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind);
ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind);
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -242,48 +260,53 @@ void NECropKernel::configure_output_shape()
    // The normalized coordinates are scaled to retrieve the floating point image coordinates, which are rounded to integers.
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
- std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1);
+ _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1,
+ abs(_end[1] - _start[1]) + 1);
_output->info()->set_tensor_shape(out_shape);
bool is_width_flipped = _end[0] < _start[0];
bool is_height_flipped = _end[1] < _start[1];
- if(is_height_flipped)
+ if (is_height_flipped)
{
- _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
_rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
else
{
_rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
- _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
+ _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
- if(is_width_flipped)
+ if (is_width_flipped)
{
- _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
_cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
else
{
_cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
- _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
+ _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
INEKernel::configure(calculate_max_window(*_output->info()));
@@ -298,13 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
- const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()});
uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
- Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
- _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
- execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1],
- _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
+ Coordinates input_offset(
+ 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
+ _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
+    execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds,
+                   uk->ukernel, _end[1] < _start[1],
+                   _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1),
+                   _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
_start[0] <= _end[0], _end[0] < _start[0]);
}
} // namespace arm_compute
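out_of_bounds_crop_window(), reformatted above, is a standard vector-fill-plus-scalar-tail loop: splat the extrapolation value into a 128-bit register, store four floats per step, then finish the remainder element by element. A standalone NEON sketch (buffer length and fill value are illustrative):

```cpp
#include <arm_neon.h>

#include <cstdio>

void fill_f32(float *dst, int count, float value)
{
    const int         step = 4; // 16 bytes / sizeof(float), like window_step_x
    const float32x4_t v    = vdupq_n_f32(value);
    int               x    = 0;
    for (; x <= count - step; x += step)
    {
        vst1q_f32(dst + x, v); // vector body
    }
    for (; x < count; ++x)
    {
        dst[x] = value; // scalar tail for the 0..3 leftover elements
    }
}

int main()
{
    float buf[10];
    fill_f32(buf, 10, 0.5f);
    std::printf("%g %g\n", buf[0], buf[9]);
    return 0;
}
```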
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 6c989c1d2c..da4a1b26e5 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_NEON_CROP_KERNEL_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -67,7 +67,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ void configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -82,7 +87,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Configure output tensor's shape as this can only be determined at runtime. */
void configure_output_shape();
@@ -91,7 +101,8 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
/** Function to use for in bounds crop for the particular tensor types passed to configure() */
- using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
+ using InBoundsCropFunction =
+ void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
private:
const ITensor *_input;
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 6dcc85ec2e..de0079ee60 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -52,12 +53,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -74,7 +77,8 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
    // Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -117,26 +121,27 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
Window slice_in = window.first_slice_window_2D();
do
{
Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{ out_x, out_y, z, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_2D(slice_in));
+ execute_window_loop(
+ slice_in,
+ [&](const Coordinates &id)
+ {
+ const int x = id.x();
+ const int y = id.y();
+
+ const int z = id.z() % r;
+ const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
+ Coordinates output_coords{out_x, out_y, z, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_2D(slice_in));
}
else
{
@@ -144,20 +149,21 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
do
{
Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{ z, out_x, out_y, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_3D(slice_in));
+ execute_window_loop(
+ slice_in,
+ [&](const Coordinates &id)
+ {
+ const int x = id.y();
+ const int y = id.z();
+
+ const int z = id.x() % r;
+ const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
+ Coordinates output_coords{z, out_x, out_y, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_3D(slice_in));
}
}
} // namespace arm_compute
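The NCHW branch above is the inverse of space-to-depth: with r output channels (input channels divided by block²), the input channel index encodes both the output channel and the (x, y) offset inside each block×block tile. A standalone sketch of the mapping with illustrative sizes:

```cpp
#include <cstdio>

int main()
{
    const int block = 2; // _block_shape
    const int r     = 1; // output channels = input channels / (block * block)

    // Where does each input element (x=0, y=0, z) land in the output?
    for (int z = 0; z < r * block * block; ++z)
    {
        const int x = 0, y = 0;
        const int out_z = z % r;
        const int out_x = x * block + (z / r) % block;
        const int out_y = y * block + (z / r) / block;
        std::printf("in(z=%d) -> out(x=%d, y=%d, z=%d)\n", z, out_x, out_y, out_z);
    }
    return 0;
}
```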
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index 261437f07d..a5969cd497 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,16 +38,19 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -56,7 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
@@ -68,12 +75,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
{
}
-void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config)
+void NEFFTDigitReverseKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
@@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
- if(axis == 0)
+ if (axis == 0)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>;
}
@@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>;
}
}
- else if(axis == 1)
+ else if (axis == 1)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>;
}
@@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
}
}
-Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window)
std::vector<float> buffer_row_out(2 * N);
std::vector<float> buffer_row_in(2 * N);
- execute_window_loop(slice, [&](const Coordinates &)
- {
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &)
{
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
-
- // Shuffle
- for(size_t x = 0; x < 2 * N; x += 2)
+ if (is_input_complex)
{
- size_t idx = buffer_idx[x / 2];
- buffer_row_out[x] = buffer_row_in[2 * idx];
- buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
- }
- }
- else
- {
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
- // Shuffle
- for(size_t x = 0; x < N; ++x)
+ // Shuffle
+ for (size_t x = 0; x < 2 * N; x += 2)
+ {
+ size_t idx = buffer_idx[x / 2];
+ buffer_row_out[x] = buffer_row_in[2 * idx];
+ buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
+ }
+ }
+ else
{
- size_t idx = buffer_idx[x];
- buffer_row_out[2 * x] = buffer_row_in[idx];
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+
+ // Shuffle
+ for (size_t x = 0; x < N; ++x)
+ {
+ size_t idx = buffer_idx[x];
+ buffer_row_out[2 * x] = buffer_row_in[idx];
+ }
}
- }
- // Copy back
- memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
- },
- in, out);
+ // Copy back
+ memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
+ },
+ in, out);
}
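
For readers following the reshuffle above: digit reversal permutes each length-N row through a precomputed index table so that the radix stages that follow can run in natural order. A minimal scalar sketch of the complex, conjugating path, using std::complex in place of the kernel's interleaved float pairs (names here are illustrative, not library API):

#include <complex>
#include <cstddef>
#include <vector>

std::vector<std::complex<float>> digit_reverse_row(const std::vector<std::complex<float>> &in,
                                                   const std::vector<std::size_t>         &lut,
                                                   bool                                    conjugate)
{
    std::vector<std::complex<float>> out(in.size());
    for (std::size_t x = 0; x < in.size(); ++x)
    {
        // lut[x] plays the role of buffer_idx[x / 2] in the kernel above.
        const std::complex<float> v = in[lut[x]];
        out[x] = conjugate ? std::conj(v) : v; // the is_conj branch
    }
    return out;
}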
template <bool is_input_complex, bool is_conj>
@@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window)
const size_t stride_z = _input->info()->strides_in_bytes()[2];
const size_t stride_w = _input->info()->strides_in_bytes()[3];
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- auto *out_ptr = reinterpret_cast<float *>(out.ptr());
- auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
- const size_t y_shuffled = buffer_idx[id.y()];
-
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- // Shuffle the entire row into the output
- memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+ auto *out_ptr = reinterpret_cast<float *>(out.ptr());
+ auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
+ const size_t y_shuffled = buffer_idx[id.y()];
- // Conjugate if necessary
- if(is_conj)
+ if (is_input_complex)
{
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ // Shuffle the entire row into the output
+ memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+
+ // Conjugate if necessary
+ if (is_conj)
{
- out_ptr[x + 1] = -out_ptr[x + 1];
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x + 1] = -out_ptr[x + 1];
+ }
}
}
- }
- else
- {
- // Shuffle the entire row into the buffer
- memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
-
- // Copy the buffer to the output, with a zero imaginary part
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ else
{
- out_ptr[x] = buffer_row[x / 2];
+ // Shuffle the entire row into the buffer
+ memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
+
+ // Copy the buffer to the output, with a zero imaginary part
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x] = buffer_row[x / 2];
+ }
}
- }
- },
- out);
+ },
+ out);
}
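
The axis-1 variant above moves whole rows instead of single elements, so the complex case reduces to one memcpy per output row. A row-level sketch under the same assumptions (interleaved float pairs, lut mapping output row to shuffled input row):

#include <cstddef>
#include <cstring>
#include <vector>

void digit_reverse_rows(float *dst, const float *src,
                        const std::vector<std::size_t> &lut,
                        std::size_t N /* complex elements per row */)
{
    for (std::size_t y = 0; y < lut.size(); ++y)
    {
        // One contiguous copy per shuffled row, as in the kernel's memcpy of 2 * Nx floats.
        std::memcpy(dst + 2 * N * y, src + 2 * N * lut[y], 2 * N * sizeof(float));
    }
}

The real-input path additionally widens each sample to a complex pair, writing it into the even (real) slot so the row ends up with a zero imaginary part, as the kernel's comment notes.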
void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
index f436c364b2..ecf85ebc98 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -70,7 +71,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index 44c841f626..4b58a7b9ac 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -28,10 +28,11 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "support/ToolchainSupport.h"
#include <arm_neon.h>
@@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
{
using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
- const float32x2_t mask = { -1.0, 1.0 };
+ const float32x2_t mask = {-1.0, 1.0};
const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
@@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
const float a_r = wrapper::vgetlane(a, 0);
const float a_i = wrapper::vgetlane(a, 1);
- const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+ const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant});
return out;
}
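
The two helpers above implement complex multiplication on float32x2_t lanes: the {-1, 1} mask supplies the sign flip on the imaginary-by-imaginary term once the operands are duplicated lane-wise, and the pure-imaginary variant uses (ar + i*ai) * (i*c) = -ai*c + i*ar*c, which is exactly the {-a_i, a_r} * {c, c} product shown. A scalar reference for cross-checking (standard arithmetic, not library API):

#include <complex>

// (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
inline std::complex<float> c_mul_ref(std::complex<float> a, std::complex<float> b)
{
    return {a.real() * b.real() - a.imag() * b.imag(),
            a.real() * b.imag() + a.imag() * b.real()};
}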
@@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_
return wrapper::vadd(t2, e);
}
-float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+float32x2_t reduce_sum_7(
+ float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32
return wrapper::vadd(t00, t01);
}
-float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+float32x2_t reduce_sum_8(float32x2_t x1,
+ float32x2_t x2,
+ float32x2_t x3,
+ float32x2_t x4,
+ float32x2_t x5,
+ float32x2_t x6,
+ float32x2_t x7,
+ float32x2_t x8)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w,
x = wrapper::vadd(a, b);
x = wrapper::vadd(x, c);
- const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
- const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
+ const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c));
+ const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c));
y = z = wrapper::vsub(a, v1);
y = wrapper::vadd(y, v2);
z = wrapper::vsub(z, v2);
}
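
fft_3 above is the standard radix-3 butterfly: with the twiddled inputs a, b, c and w3 = exp(-2*pi*i/3) = -1/2 - i*sqrt(3)/2, the outputs are a+b+c, a + w3*b + w3^2*c and a + w3^2*b + w3*c, which is why only the two shared quantities (b+c)/2 and -i*kSqrt3Div2*(b-c) need computing. A scalar sketch of the same identity:

#include <complex>

inline void fft3_ref(std::complex<float> &x, std::complex<float> &y, std::complex<float> &z)
{
    const std::complex<float> a = x, b = y, c = z;
    const std::complex<float> v1 = 0.5f * (b + c);                                  // shared half-sum
    const std::complex<float> v2 = std::complex<float>(0.f, -0.866025f) * (b - c);  // -i*sqrt(3)/2*(b-c)
    x = a + b + c;
    y = (a - v1) + v2; // a + w3*b   + w3^2*c
    z = (a - v1) - v2; // a + w3^2*b + w3*c
}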
-void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+void fft_4(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3)
{
float32x2_t a = x1;
float32x2_t b = c_mul_neon(w, x2);
@@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c
x4 = wrapper::vadd(x41, x42);
}
-void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+void fft_5(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
+ const float32x2_t &w4)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto d = c_mul_neon(w3, x4);
const auto e = c_mul_neon(w4, x5);
- const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);
+ const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b);
+ const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b);
- const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
+ const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c);
- const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
+ const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d);
- const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
+ const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e);
+ const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e);
x1 = reduce_sum_5(a, b, c, d, e);
x2 = reduce_sum_5(a, b0, c0, d0, e0);
@@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x5 = reduce_sum_5(a, b3, c3, d3, e3);
}
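
The hard-coded +/-kW5_* pairs in fft_5 are the fifth roots of unity W5^(j*k) = exp(-2*pi*i*j*k/5), laid out one output per row; on that assumption kW5_0/kW5_1 and kW5_2/kW5_3 are the cos/sin of 2*pi/5 and of pi/5 respectively. A small generator to cross-check the sign pattern:

#include <complex>
#include <cstdio>

int main()
{
    const double pi = 3.14159265358979323846;
    for (int j = 1; j <= 4; ++j)     // rows for outputs x2..x5
        for (int k = 1; k <= 4; ++k) // coefficients on b, c, d, e
        {
            const std::complex<double> w = std::polar(1.0, -2.0 * pi * j * k / 5.0);
            std::printf("x%d, input %d: {%+.5f, %+.5f}\n", j + 1, k, w.real(), w.imag());
        }
    return 0;
}

The first row prints {+0.30902, -0.95106}, {-0.80902, -0.58779}, ..., matching the {kW5_0, -kW5_1}, {-kW5_2, -kW5_3}, ... coefficients applied to b and c above.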
-void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+void fft_7(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
const float32x2_t &w4,
- const float32x2_t &w5, const float32x2_t &w6)
+ const float32x2_t &w5,
+ const float32x2_t &w6)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto f = c_mul_neon(w5, x6);
const auto g = c_mul_neon(w6, x7);
- const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
+ const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b);
+ const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c);
+ const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d);
+ const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e);
+ const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e);
+ const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e);
+ const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f);
+ const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f);
+ const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f);
+ const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f);
+ const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g);
+ const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g);
+ const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g);
+ const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g);
+ const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g);
x1 = reduce_sum_7(a, b, c, d, e, f, g);
x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
@@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
}
-void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+void fft_8(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ float32x2_t &x8,
+ const float32x2_t &w,
+ const float32x2_t &w2,
const float32x2_t &w3,
- const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+ const float32x2_t &w4,
+ const float32x2_t &w5,
+ const float32x2_t &w6,
const float32x2_t &w7)
{
const auto a = x1;
@@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto g = c_mul_neon(w6, x7);
const auto h = c_mul_neon(w7, x8);
- const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
- const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
- const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
- const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
-
- const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
- const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
- const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
- const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
- const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
+ const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b1 = c_mul_neon(float32x2_t{0, -1}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b);
+ const auto b5 = c_mul_neon(float32x2_t{0, 1}, b);
+ const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c2 = c_mul_neon(float32x2_t{0, 1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{1, 0}, c);
+ const auto c4 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c6 = c_mul_neon(float32x2_t{0, 1}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d1 = c_mul_neon(float32x2_t{0, 1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d);
+ const auto d5 = c_mul_neon(float32x2_t{0, -1}, d);
+ const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e1 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e3 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e5 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f1 = c_mul_neon(float32x2_t{0, -1}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f);
+ const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f);
+ const auto f5 = c_mul_neon(float32x2_t{0, 1}, f);
+ const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g2 = c_mul_neon(float32x2_t{0, -1}, g);
+ const auto g3 = c_mul_neon(float32x2_t{1, 0}, g);
+ const auto g4 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g6 = c_mul_neon(float32x2_t{0, -1}, g);
+
+ const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h1 = c_mul_neon(float32x2_t{0, 1}, h);
+ const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h);
+ const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h);
+ const auto h5 = c_mul_neon(float32x2_t{0, -1}, h);
+ const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h);
x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
@@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
}
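
fft_8's coefficient table is the eighth roots of unity W8^n = exp(-2*pi*i*n/8), which take only the values +/-1, +/-i and +/-sqrt(2)/2 +/- i*sqrt(2)/2, hence the single kSqrt2Div2 constant and the many {0, +/-1} and {+/-1, 0} entries. A quick cross-check:

#include <complex>
#include <cstdio>

int main()
{
    const double pi = 3.14159265358979323846;
    for (int n = 1; n <= 7; ++n)
    {
        const std::complex<double> w = std::polar(1.0, -2.0 * pi * n / 8.0);
        std::printf("W8^%d = {%+.6f, %+.6f}\n", n, w.real(), w.imag());
    }
    return 0;
}

W8^1 = {+0.707107, -0.707107} is the b0 coefficient and W8^2 = {0, -1} is b1; the c, d, ... rows are the same sequence taken with stride 2, 3, and so on.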
template <bool first_stage>
-void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_2_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- auto a = float32x2_t{ 0, 0 };
- auto b = float32x2_t{ 0, 0 };
+ auto a = float32x2_t{0, 0};
+ auto b = float32x2_t{0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_2(a, b, w);
// Write outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
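
The loop structure above is one decimation-in-time stage: j walks the Nx twiddle slots, k visits the butterflies at stride NxRadix (indices are doubled in the kernel because the floats are interleaved), and first_stage specialises the loads because in the first stage the two butterfly inputs sit adjacent in memory and can be fetched with a single vloadq. The body of fft_2 is not shown in this hunk; a scalar sketch of the whole stage, assuming the conventional a+b / a-b butterfly, looks like:

#include <complex>
#include <vector>

void radix2_stage(std::vector<std::complex<float>> &x, unsigned Nx)
{
    const unsigned N       = static_cast<unsigned>(x.size()); // assumed divisible by 2 * Nx
    const unsigned NxRadix = 2 * Nx;
    const std::complex<float> w_m = std::polar(1.0f, -3.14159265f / float(Nx)); // exp(-2*pi*i/NxRadix)
    std::complex<float>       w{1.0f, 0.0f};
    for (unsigned j = 0; j < Nx; ++j)
    {
        for (unsigned k = j; k < N; k += NxRadix)
        {
            const std::complex<float> a = x[k];
            const std::complex<float> b = w * x[k + Nx]; // twiddled partner
            x[k]      = a + b;
            x[k + Nx] = a - b;
        }
        w *= w_m; // advance to the next twiddle power
    }
}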
-void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_2_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_3_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_3(a, b, c, w, w2);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_3_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_4_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_4(a, b, c, d, w, w2, w3);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_4_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_5_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_5(a, b, c, d, e, w, w2, w3, w4);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_5_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_7_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_7_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_8_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
- float32x2_t h = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
+ float32x2_t h = {0, 0};
// Base-case prime transform
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_8_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_UNUSED(config);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_UNUSED(config);
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo
// FFT table axis 0: [radix, first_stage]
static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
- if(fft_table_axis0.empty())
+ if (fft_table_axis0.empty())
{
fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
@@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo
// FFT table axis 1: [radix, first_stage]
static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
- if(fft_table_axis1.empty())
+ if (fft_table_axis1.empty())
{
fft_table_axis1[2] = &fft_radix_2_axes_1;
fft_table_axis1[3] = &fft_radix_3_axes_1;
@@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Output auto initialization if not yet initialized
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), *input->info()->clone());
}
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
_axis = config.axis;
_radix = config.radix;
- switch(config.axis)
+ switch (config.axis)
{
case 0:
set_radix_stage_axis0(config);
@@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
}
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
+ auto win_config =
+ validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
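
supported_radix() above implies that an FFT length is handled only if it factors completely into {2, 3, 4, 5, 7, 8}, one stage per factor. A host-side feasibility sketch (the library's actual stage decomposition lives in the function layer and may pick factors differently):

#include <functional>
#include <set>
#include <vector>

std::vector<unsigned> decompose(unsigned N)
{
    static const std::set<unsigned, std::greater<unsigned>> radix{8, 7, 5, 4, 3, 2};
    std::vector<unsigned> stages;
    bool progress = true;
    while (N > 1 && progress)
    {
        progress = false;
        for (unsigned r : radix) // try the largest supported radix first
        {
            if (N % r == 0)
            {
                stages.push_back(r);
                N /= r;
                progress = true;
                break;
            }
        }
    }
    return (N == 1) ? stages : std::vector<unsigned>{}; // empty means unsupported length
}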
void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
@@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
// Precompute FFT constants
const unsigned int NxRadix = _radix * _Nx;
const float alpha = 2.0f * kPi / float(NxRadix);
- const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+ const float32x2_t w_m{cosf(alpha), -sinf(alpha)};
- if(_axis == 0)
+ if (_axis == 0)
{
const unsigned int N = _input->info()->dimension(0);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &) {
+ _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m,
+ N);
+ },
+ in, out);
}
else
{
const unsigned int N = _input->info()->dimension(0);
const unsigned int M = _input->info()->dimension(1);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M,
- _input->info()->padding().right + _input->info()->padding().left,
- _output->info()->padding().right + _output->info()->padding().left);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &)
+ {
+ _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N,
+ M, _input->info()->padding().right + _input->info()->padding().left,
+ _output->info()->padding().right + _output->info()->padding().left);
+ },
+ in, out);
}
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
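
The per-stage constant w_m = {cosf(alpha), -sinf(alpha)} with alpha = 2*pi/(radix * Nx) is the base twiddle exp(-i*alpha); the w2, w3, ... values built inside each j loop are then powers of the running twiddle w = w_m^j. Equivalent scalar form:

#include <cmath>
#include <complex>

inline std::complex<float> base_twiddle(unsigned radix, unsigned Nx)
{
    const float alpha = 2.0f * 3.14159265f / float(radix * Nx);
    return {std::cos(alpha), -std::sin(alpha)}; // same value as std::polar(1.0f, -alpha)
}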
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
index 2291a1068c..54f32efa23 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
#include <arm_neon.h>
@@ -92,8 +93,17 @@ private:
void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
- using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
- using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int, unsigned int, unsigned int)>;
+ using FFTFunctionPointerAxis0 =
+ std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
+ using FFTFunctionPointerAxis1 = std::function<void(float *,
+ float *,
+ unsigned int,
+ unsigned int,
+ const float32x2_t &,
+ unsigned int,
+ unsigned int,
+ unsigned int,
+ unsigned int)>;
FFTFunctionPointerAxis0 _func_0;
FFTFunctionPointerAxis1 _func_1;
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index 5ec330bebc..9fe561fc59 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -28,9 +28,10 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -41,8 +42,8 @@ namespace
void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale)
{
const auto a = wrapper::vload(c_in);
- auto b = wrapper::vdiv(a, float32x2_t{ scale, scale });
- if(is_conjugate)
+ auto b = wrapper::vdiv(a, float32x2_t{scale, scale});
+ if (is_conjugate)
{
const float img_part = wrapper::vgetlane(b, 1);
b = wrapper::vsetlane(-img_part, b, 1);
@@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -71,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Configure kernel window
Window win = calculate_max_window(*input, Steps());
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
@@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info)
Iterator in(_input, input_window);
Iterator out(_run_in_place ? _input : _output, input_window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ { scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale); },
+ in, out);
}
} // namespace arm_compute
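
scale_complex above is the normalisation step: every complex sample is divided by the transform scale and, when is_conjugate is set, has its imaginary lane negated, which matches the usual conjugate-and-scale trick for running an inverse FFT through a forward kernel. Scalar equivalent:

#include <complex>

inline std::complex<float> scale_ref(std::complex<float> v, bool conjugate, float scale)
{
    v /= scale;                          // wrapper::vdiv by {scale, scale}
    return conjugate ? std::conj(v) : v; // negate the imaginary lane
}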
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h
index 24a19f98ba..608cf5ea34 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.h
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
#define ARM_COMPUTE_NEFFTSCALEKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
// Forward declarations
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 1c7c1f9763..00b0c0ae8d 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -30,14 +30,19 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
{
namespace
{
-inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+inline void fill_constant_value_single_channel_special(ITensor *tensor,
+ const Window &window,
+ unsigned int right,
+ unsigned int bottom,
+ const PixelValue &constant_border_value)
{
float border_value;
constant_border_value.get(border_value);
@@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi
Iterator vertical_it(tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
+ {
+ const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
- // Fill left and right borders
- *(row_start - 1) = border_value;
- std::fill_n(row_start + width, right, border_value);
- },
- vertical_it);
+ // Fill left and right borders
+ *(row_start - 1) = border_value;
+ std::fill_n(row_start + width, right, border_value);
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
- // Fill top rows including left/right borders
- std::fill_n(row_start - 1, 1 + width + right, border_value);
-
- // Bottom border
- const unsigned low_border_size = height + bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
-
- // Fill bottom rows including left/right borders
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
+ // Fill top rows including left/right borders
std::fill_n(row_start - 1, 1 + width + right, border_value);
- }
- },
- plane_it);
+
+ // Bottom border
+ const unsigned low_border_size = height + bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
+ {
+ const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+ }
+ },
+ plane_it);
}
} // namespace
@@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel()
{
}
-void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
_tensor = tensor;
configure(tensor->info(), border_size, border_mode, constant_border_value);
}
-void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
// If there is no border: early exit
- if(_border_size.empty())
+ if (_border_size.empty())
{
return;
}
@@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_mode)
+ switch (_mode)
{
case BorderMode::CONSTANT:
{
- if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
+ if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
{
- fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom,
+ _constant_border_value);
}
else
{
@@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(),
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size,
+ element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Copy top rows including left/right borders
- std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size,
- base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
+ {
+ // Copy top rows including left/right borders
+ std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) -
+ _border_size.left * element_size,
+ base_addr - _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
- // Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
- {
- // Copy bottom rows including left/right borders
- std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
- base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
- },
- plane_it);
+ // Bottom border
+ for (unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ // Copy bottom rows including left/right borders
+ std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+ base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] -
+ _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
+ },
+ plane_it);
}
void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
@@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value,
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Fill top rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill top rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- // Bottom border
- const unsigned low_border_size = height + _border_size.bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
- {
- // Fill bottom rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ // Bottom border
+ const unsigned low_border_size = height + _border_size.bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill bottom rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- },
- plane_it);
+ },
+ plane_it);
}
} // namespace arm_compute
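For orientation, the functions reflowed above fill the border region around each XY plane, either by replicating edge rows or with a constant value. A minimal scalar sketch of the constant case, not part of the patch, using a hypothetical fill_constant_border() over a contiguous single-channel buffer (real tensors use per-dimension byte strides):

#include <cstddef>

// Fill every pixel outside the inner (w x h) region of a
// ((w + 2*size) x (h + 2*size)) buffer with 'value'. Simplified model of the
// left/right and top/bottom loops above; strides and element sizes omitted.
void fill_constant_border(float *buf, size_t w, size_t h, size_t size, float value)
{
    const size_t full_w = w + 2 * size;
    const size_t full_h = h + 2 * size;
    for (size_t y = 0; y < full_h; ++y)
    {
        for (size_t x = 0; x < full_w; ++x)
        {
            const bool inside = x >= size && x < size + w && y >= size && y < size + h;
            if (!inside)
            {
                buf[y * full_w + x] = value;
            }
        }
    }
}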
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
index 2c851583ed..aaad108bfa 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.h
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -64,7 +65,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the function.
*
* @note This kernel fills the borders within the XY-planes.
@@ -75,7 +79,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 51a69046a9..cbe5136fb1 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/cpu/kernels/fuse_batch_normalization/list.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -30,12 +29,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/fuse_batch_normalization/list.h"
#include <map>
@@ -52,8 +53,16 @@ struct FuseBatchNormalizeSelectorData
};
using FBNSelectorPtr = std::add_pointer<bool(const FuseBatchNormalizeSelectorData &data)>::type;
-using FBNUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, ITensor *,
- const ITensor *, const ITensor *, const ITensor *, const ITensor *, float, const Window &)>::type;
+using FBNUKernelPtr = std::add_pointer<void(const ITensor *,
+ const ITensor *,
+ ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ const Window &)>::type;
struct FBNUKernel
{
@@ -62,73 +71,63 @@ struct FBNUKernel
FBNUKernelPtr ukernel;
};
-static const FBNUKernel available_kernels[] =
-{
- {
- "fused_batch_normalization_conv_NHWC_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
- },
- {
- "fused_batch_normalization_conv_NCHW_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
- },
- {
- "fused_batch_normalization_dwc_NHWC_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)
- },
- {
- "fused_batch_normalization_dwc_NCHW_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)
- },
- {
- "fused_batch_normalization_conv_NHWC_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
- },
- {
- "fused_batch_normalization_conv_NCHW_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
- },
- {
- "fused_batch_normalization_dwc_NHWC_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)
- },
- {
- "fused_batch_normalization_dwc_NCHW_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)
- }
-};
+static const FBNUKernel available_kernels[] = {
+ {"fused_batch_normalization_conv_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_conv_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_dwc_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)},
+ {"fused_batch_normalization_dwc_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)},
+ {"fused_batch_normalization_conv_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_conv_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_dwc_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)},
+ {"fused_batch_normalization_dwc_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}};
/** Micro-kernel selector
*
@@ -140,9 +139,9 @@ static const FBNUKernel available_kernels[] =
*/
const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -150,10 +149,16 @@ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -164,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -212,15 +218,31 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false),
+ _func(nullptr)
{
}
-void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
- ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -238,27 +260,27 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con
_run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
- const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{
+ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
_func = uk->ukernel;
@@ -268,12 +290,19 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con
INEKernel::configure(win);
}
-Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
@@ -284,6 +313,7 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window);
+ (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon,
+ window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
index ee767b01c8..f23280d55a 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
@@ -66,9 +66,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias = nullptr,
+ const ITensor *bn_beta = nullptr,
+ const ITensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -86,10 +93,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -107,8 +120,16 @@ private:
bool _run_in_place_weights;
bool _run_in_place_bias;
- using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
+ using FuseBatchNormFunction = void(const ITensor *input_weights,
+ const ITensor *input_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window);
FuseBatchNormFunction *_func;
};
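The ten-parameter configure()/validate() pair above wraps the standard fold of batch-norm statistics into convolution weights and bias. As a reminder of that math, a minimal scalar sketch, not part of the patch (hypothetical fuse_bn_channel(), applied to each weight w of output channel c and that channel's bias b):

#include <cmath>

// Standard batch-norm fusion per output channel c:
//   w'[c] = w[c] * gamma[c] / sqrt(var[c] + epsilon)
//   b'[c] = (b[c] - mean[c]) * gamma[c] / sqrt(var[c] + epsilon) + beta[c]
void fuse_bn_channel(float &w, float &b, float mean, float var, float gamma, float beta, float epsilon)
{
    const float scale = gamma / std::sqrt(var + epsilon);
    w = w * scale;
    b = (b - mean) * scale + beta;
}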
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 11332ffac8..f1d457d399 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,20 +43,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(axis < 0)
+ if (axis < 0)
{
axis += input->num_dimensions();
}
ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 >
+ Coordinates::num_max_dimensions);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -81,23 +84,23 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info)
const auto idx_info = _indices->info();
const auto dst_info = _output->info();
- const auto num_dims = dst_info->num_dimensions();
+ const auto num_dims = dst_info->num_dimensions();
const auto chunk_stride = src_info->strides_in_bytes()[_axis];
const auto window_start_x = window.x().start();
- const auto window_end_x = window.x().end();
- auto window_size_x = src_info->element_size();
+ const auto window_end_x = window.x().end();
+ auto window_size_x = src_info->element_size();
const auto idx_limit = static_cast<TIndex>(src_info->tensor_shape()[_axis]);
- if(_axis != 0)
+ if (_axis != 0)
{
dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1));
window_size_x *= window_end_x - window_start_x;
}
// Compute source and index tensors window based on the output window.
- auto src_win = dst_win;
+ auto src_win = dst_win;
Window idx_win;
for (size_t i = 0; i < idx_info->num_dimensions(); ++i)
@@ -109,22 +112,27 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info)
// Use the custom strides to access all three tensors using the same loop.
Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win);
Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win);
- Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win);
-
- execute_window_loop(dst_win, [&](const Coordinates &) {
- const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
-
- if(idx >= 0 && idx < idx_limit)
- {
- const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+ Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(),
+ dst_info->offset_first_element_in_bytes(), dst_win);
- std::copy_n(src_ptr, window_size_x, dst_it.ptr());
- }
- else
+ execute_window_loop(
+ dst_win,
+ [&](const Coordinates &)
{
- std::fill_n(dst_it.ptr(), window_size_x, 0);
- }
- }, src_it, idx_it, dst_it);
+ const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
+
+ if (idx >= 0 && idx < idx_limit)
+ {
+ const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+
+ std::copy_n(src_ptr, window_size_x, dst_it.ptr());
+ }
+ else
+ {
+ std::fill_n(dst_it.ptr(), window_size_x, 0);
+ }
+ },
+ src_it, idx_it, dst_it);
}
void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
@@ -137,13 +145,13 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
_output = output;
_axis = axis;
- if(_axis < 0)
+ if (_axis < 0)
{
_axis += input->info()->num_dimensions();
}
ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
- switch(_indices->info()->data_type())
+ switch (_indices->info()->data_type())
{
case DataType::U32:
_func = &NEGatherKernel::gather_common<uint32_t>;
@@ -157,7 +165,8 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
}
// Output auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Create window
@@ -169,30 +178,31 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
// These will be used to iterate lock-step through all tensors (input, indices and output).
size_t dim_no = 0;
- const auto input_info = input->info();
+ const auto input_info = input->info();
const auto &input_strides = input_info->strides_in_bytes();
- const auto indices_info = indices->info();
- const auto &indices_strides = indices_info->strides_in_bytes();
- const auto indices_num_dims = indices_info->num_dimensions();
+ const auto indices_info = indices->info();
+ const auto &indices_strides = indices_info->strides_in_bytes();
+ const auto indices_num_dims = indices_info->num_dimensions();
- for(; dim_no < static_cast<size_t>(_axis); ++dim_no)
+ for (; dim_no < static_cast<size_t>(_axis); ++dim_no)
{
_src_it_strides[dim_no] = input_strides[dim_no];
}
- for(; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
+ for (; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
{
_idx_it_strides[dim_no] = indices_strides[dim_no - _axis];
}
- for(; dim_no < Coordinates::num_max_dimensions; ++dim_no)
+ for (; dim_no < Coordinates::num_max_dimensions; ++dim_no)
{
_src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1];
}
}
-Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
return Status{};
diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h
index ce69daeda7..b8c069f99e 100644
--- a/src/core/NEON/kernels/NEGatherKernel.h
+++ b/src/core/NEON/kernels/NEGatherKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_NEGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -92,8 +93,8 @@ private:
ITensor *_output;
kernel_ptr _func;
- Strides _src_it_strides;
- Strides _idx_it_strides;
+ Strides _src_it_strides;
+ Strides _idx_it_strides;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
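The chunked loop in gather_common() above copies one source chunk per index and zero-fills when an index is out of range. A plain scalar equivalent for axis 0, not part of the patch, with the tensor simplified to a flat row-major buffer:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Gather rows of 'src' selected by 'indices'; out-of-range indices yield a
// zero row, mirroring the std::fill_n branch in gather_common() above.
std::vector<float> gather_axis0(const std::vector<float> &src, size_t row_len,
                                const std::vector<int32_t> &indices)
{
    const size_t num_rows = src.size() / row_len;
    std::vector<float> dst(indices.size() * row_len, 0.0f);
    for (size_t i = 0; i < indices.size(); ++i)
    {
        const int32_t idx = indices[i];
        if (idx >= 0 && static_cast<size_t>(idx) < num_rows)
        {
            std::memcpy(dst.data() + i * row_len, src.data() + idx * row_len, row_len * sizeof(float));
        }
    }
    return dst;
}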
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 7bba136e84..549319e49f 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -27,11 +27,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/genproposals/list.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -44,7 +46,8 @@ struct ComputeAllAnchorsData
};
using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
-using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(
+ const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
struct ComputeAllAnchorsKernel
{
@@ -53,27 +56,17 @@ struct ComputeAllAnchorsKernel
ComputeAllAnchorsUKernelPtr ukernel;
};
-static const ComputeAllAnchorsKernel available_kernels[] =
-{
+static const ComputeAllAnchorsKernel available_kernels[] = {
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "neon_qu16_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)
- },
+ {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "neon_fp16_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)
- },
+ {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "neon_fp32_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)
- },
+ {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)},
};
/** Micro-kernel selector
@@ -84,9 +77,9 @@ static const ComputeAllAnchorsKernel available_kernels[] =
*/
const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -101,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
const size_t feature_height = info.feat_height();
const size_t feature_width = info.feat_width();
@@ -111,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -139,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -151,7 +145,9 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
INEKernel::configure(win);
}
-Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
@@ -163,7 +159,7 @@ void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() });
+ const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index 297d6d4abe..30699eee01 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -78,5 +78,5 @@ private:
ITensor *_all_anchors;
ComputeAnchorsInfo _anchors_info;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
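NEComputeAllAnchorsKernel above replicates num_anchors base boxes across every cell of a feat_width x feat_height grid, producing the (values_per_roi, width * height * num_anchors) shape checked in validate_arguments(). A scalar sketch of that expansion, not part of the patch, assuming 4 values per ROI in (x1, y1, x2, y2) form and a uniform feature stride (both assumptions, not taken from this diff):

#include <cstddef>
#include <vector>

// Shift each base anchor to every (x, y) cell of a feat_w x feat_h grid.
// Output layout follows the shape computed above: 4 values per ROI,
// feat_w * feat_h * num_anchors rows.
std::vector<float> compute_all_anchors(const std::vector<float> &base, // 4 * num_anchors values
                                       size_t feat_w, size_t feat_h, float stride)
{
    const size_t num_anchors = base.size() / 4;
    std::vector<float> all(4 * feat_w * feat_h * num_anchors);
    size_t out = 0;
    for (size_t y = 0; y < feat_h; ++y)
    {
        for (size_t x = 0; x < feat_w; ++x)
        {
            for (size_t a = 0; a < num_anchors; ++a)
            {
                all[out++] = base[4 * a + 0] + x * stride;
                all[out++] = base[4 * a + 1] + y * stride;
                all[out++] = base[4 * a + 2] + x * stride;
                all[out++] = base[4 * a + 3] + y * stride;
            }
        }
    }
    return all;
}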
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 71641404bf..0a1780f6ee 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -31,12 +31,13 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/instancenorm/list.h"
#include <arm_neon.h>
@@ -51,7 +52,13 @@ struct InstanceNormSelectorData
};
using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
-using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)>::type;
struct InstanceNormKernel
{
@@ -60,19 +67,12 @@ struct InstanceNormKernel
InstanceNormUKernelPtr ukernel;
};
-static const InstanceNormKernel available_kernels[] =
-{
- {
- "fp32_neon_instancenorm",
- [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)
- },
+static const InstanceNormKernel available_kernels[] = {
+ {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_instancenorm",
- [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)
- },
+ {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
};
@@ -84,9 +84,9 @@ static const InstanceNormKernel available_kernels[] =
*/
const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -102,14 +102,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+ "NHWC data layout is not supported by the kernel directly");
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
@@ -132,7 +134,9 @@ NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
{
}
-void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void NEInstanceNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -152,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
INEKernel::configure(std::get<1>(win_config));
}
-Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
@@ -165,7 +172,7 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index f166ce2058..024ccd9ef2 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -68,7 +68,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,14 +83,15 @@ private:
* @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to 0.0
* @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12
*/
- using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+ using NormalizationFunction =
+ void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
ITensor *_input;
ITensor *_output;
float _gamma;
float _beta;
float _epsilon;
- bool _use_mixed_precision{ true };
+ bool _use_mixed_precision{true};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
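The gamma, beta and epsilon parameters threaded through the reformatted InstanceNormUKernelPtr signature are the usual instance-normalization parameters. A scalar reference for one H*W plane, not part of the patch (the NEON kernels vectorize this and optionally use mixed precision):

#include <cmath>
#include <cstddef>

// Scalar reference for one plane of n elements:
//   y = gamma * (x - mean) / sqrt(var + epsilon) + beta
void instance_norm_plane(const float *x, float *y, size_t n, float gamma, float beta, float epsilon)
{
    float mean = 0.f;
    for (size_t i = 0; i < n; ++i) { mean += x[i]; }
    mean /= static_cast<float>(n);
    float var = 0.f;
    for (size_t i = 0; i < n; ++i) { var += (x[i] - mean) * (x[i] - mean); }
    var /= static_cast<float>(n);
    const float inv_stddev = 1.f / std::sqrt(var + epsilon);
    for (size_t i = 0; i < n; ++i) { y[i] = gamma * (x[i] - mean) * inv_stddev + beta; }
}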
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 8ab0288ab1..eea57a17d3 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -30,11 +30,12 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
#include "src/cpu/kernels/l2normlayer/list.h"
#include <arm_neon.h>
@@ -55,7 +56,8 @@ struct L2NormalizeLayerSelectorData
using L2NormalizeLayerKernelSelctorPtr = std::add_pointer<bool(const L2NormalizeLayerSelectorData &data)>::type;
-using L2NormalizeLayerPtr = std::add_pointer<void(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
+using L2NormalizeLayerPtr = std::add_pointer<void(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
struct L2NormalizeLayerKernel
{
@@ -64,26 +66,25 @@ struct L2NormalizeLayerKernel
L2NormalizeLayerPtr ukernel;
};
-static const L2NormalizeLayerKernel available_kernels[] =
-{
- {
- "fp32_neon_l2normalize_x",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)
- },
- {
- "fp32_neon_l2normalize_yz",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)
- },
+static const L2NormalizeLayerKernel available_kernels[] = {
+ {"fp32_neon_l2normalize_x",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)},
+ {"fp32_neon_l2normalize_yz",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)},
{
"fp16_neon_l2normalize_x",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x),
},
{
"fp16_neon_l2normalize_yz",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz),
},
};
@@ -96,9 +97,9 @@ static const L2NormalizeLayerKernel available_kernels[] =
*/
const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -106,7 +107,8 @@ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorD
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -115,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -151,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
{
}
-void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
+void NEL2NormalizeLayerKernel::configure(
+ const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
@@ -169,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su
INEKernel::configure(std::get<1>(win_config));
}
-Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status NEL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
@@ -183,12 +189,13 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_actual_axis > 2)
+ if (_actual_axis > 2)
{
ARM_COMPUTE_ERROR("Unsupported normalization axis");
}
- const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(
+ L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
index af3ad3403e..3524e66a21 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -74,7 +74,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
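For the x-axis case selected above, L2 normalization divides each element by the square root of the precomputed sum of squares, with epsilon assumed here to act as a lower bound on that sum (an assumption; this sketch is not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Scalar sketch: y[i] = x[i] / sqrt(max(sum_of_squares, epsilon)).
// 'sum_of_squares' corresponds to the reduced 'sum' tensor passed to the kernel.
void l2_normalize_row(const float *x, float sum_of_squares, float *y, size_t n, float epsilon)
{
    const float inv_norm = 1.f / std::sqrt(std::max(sum_of_squares, epsilon));
    for (size_t i = 0; i < n; ++i)
    {
        y[i] = x[i] * inv_norm;
    }
}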
diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp
index 6939e08ef0..6be6284528 100644
--- a/src/core/NEON/kernels/NELogicalKernel.cpp
+++ b/src/core/NEON/kernels/NELogicalKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,7 +51,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -58,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -66,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) && (*src1);
++src0;
@@ -84,21 +85,21 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) && broadcast_val_clamped_s;
++src;
@@ -112,7 +113,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -120,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -128,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) || (*src1);
++src0;
@@ -146,21 +147,21 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) || broadcast_val_clamped_s;
++src;
@@ -173,21 +174,21 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = !(*src);
++src;
@@ -197,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
void run_unary(const Window &window, const ITensor *src, ITensor *dst)
{
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
const auto len = window.x().end() - window.x().start();
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- neon_logical_not(in.ptr(), out.ptr(), len);
- },
- in, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out);
}
void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
@@ -216,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
const auto len = window.x().end() - window.x().start();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
- using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
- LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
+ using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
+ LogicalBroadcastUKernelPtr logical_func =
+ op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
const bool is_broadcast_input_1 = src1_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win;
@@ -238,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const uint8_t broadcast_value = *broadcast_in.ptr();
- logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
-
- },
- broadcast_in, non_broadcast_in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const uint8_t broadcast_value = *broadcast_in.ptr();
+ logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
+ },
+ broadcast_in, non_broadcast_in, out);
}
else
{
- using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
+ using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;
src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -257,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator in0(src0, src0_win);
Iterator in1(src1, src1_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
- },
- in0, in1, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out);
}
}
} // namespace
@@ -270,7 +267,10 @@ const char *NELogicalKernel::name() const
return "NELogicalKernel";
}
-void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
+void NELogicalKernel::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op));
@@ -279,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
Window win = calculate_max_window(*input1, Steps());
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
@@ -292,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
set_data_type_if_unknown(*output, input1->data_type());
}
-Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
+Status NELogicalKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown);
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -306,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -326,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T
const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_op == LogicalOperation::Not)
+ if (_op == LogicalOperation::Not)
{
run_unary(window, src0, dst);
}
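The run_binary reflow above preserves a simple dispatch idiom: declare a function-pointer type with std::add_pointer, select the micro-kernel once, then run it over the whole range. A minimal self-contained sketch of that idiom — with scalar stubs standing in for the real NEON ukernels, not library code — looks like:

#include <cstdint>
#include <type_traits>

namespace
{
// Hypothetical scalar stand-ins for the vectorized micro-kernels, for illustration only.
void neon_or(const std::uint8_t *a, const std::uint8_t *b, std::uint8_t *dst, std::uint32_t len)
{
    for (std::uint32_t i = 0; i < len; ++i)
    {
        dst[i] = (a[i] || b[i]) ? 1 : 0;
    }
}
void neon_and(const std::uint8_t *a, const std::uint8_t *b, std::uint8_t *dst, std::uint32_t len)
{
    for (std::uint32_t i = 0; i < len; ++i)
    {
        dst[i] = (a[i] && b[i]) ? 1 : 0;
    }
}
} // namespace

enum class LogicalOperation
{
    And,
    Or
};

void run(const std::uint8_t *a, const std::uint8_t *b, std::uint8_t *dst, std::uint32_t len, LogicalOperation op)
{
    // Same shape as run_binary: pick the micro-kernel once, then apply it to the data.
    using UKernelPtr =
        std::add_pointer<void(const std::uint8_t *, const std::uint8_t *, std::uint8_t *, std::uint32_t)>::type;
    UKernelPtr fn = (op == LogicalOperation::Or) ? &neon_or : &neon_and;
    fn(a, b, dst, len);
}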
diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h
index caf69cf45d..477a59d826 100644
--- a/src/core/NEON/kernels/NELogicalKernel.h
+++ b/src/core/NEON/kernels/NELogicalKernel.h
@@ -58,10 +58,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
+ static Status
+ validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 37e88a8565..451031d696 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -28,12 +28,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/meanstddevnorm/list.h"
namespace arm_compute
@@ -46,7 +47,8 @@ struct MeanStdDevNormSelectorData
};
using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
-using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+using MeanStdDevNormUKernelPtr =
+ std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
struct MeanStdDevNormKernel
{
@@ -55,25 +57,15 @@ struct MeanStdDevNormKernel
MeanStdDevNormUKernelPtr ukernel;
};
-static const std::vector<MeanStdDevNormKernel> available_kernels =
-{
- {
- "fp32_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
- },
+static const std::vector<MeanStdDevNormKernel> available_kernels = {
+ {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
- },
+ {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "qasymm8_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)
- },
+ {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)},
};
/** Micro-kernel selector
@@ -84,9 +76,9 @@ static const std::vector<MeanStdDevNormKernel> available_kernels =
*/
const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -103,7 +95,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -113,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
- if(output != nullptr)
+ if (output != nullptr)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto initialization if not yet initialized
@@ -128,8 +120,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
- : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
+NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
{
}
@@ -137,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -152,7 +144,9 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr)
+ .first);
return Status{};
}
@@ -162,7 +156,7 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() });
+ const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _epsilon, window);
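The available_kernels reflow above is an instance of the selector-table pattern used throughout these kernels: an ordered array of {name, predicate, ukernel} entries scanned until a predicate matches, with nullptr meaning "no implementation for this configuration". A reduced sketch under assumed names (DataType, SelectorData, and the stub kernels here are illustrative, not the library's types or registrars):

enum class DataType
{
    F32,
    F16,
    QASYMM8
};

struct SelectorData
{
    DataType dt;
};

struct KernelEntry
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

void run_fp32() {}
void run_qasymm8() {}

static const KernelEntry available_kernels[] = {
    {"fp32", [](const SelectorData &d) { return d.dt == DataType::F32; }, &run_fp32},
    {"qasymm8", [](const SelectorData &d) { return d.dt == DataType::QASYMM8; }, &run_qasymm8},
};

// First matching entry wins; callers must handle the nullptr case.
const KernelEntry *get_implementation(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr;
}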
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 49a045382d..2c61bda147 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -29,19 +29,23 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/NormalizationHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
@@ -52,7 +56,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -69,7 +73,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel()
{
}
-void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayerKernel::configure(const ITensor *input,
+ const ITensor *input_squared,
+ ITensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
// Output tensor auto initialization if not yet initialized
@@ -85,15 +92,15 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
_output = output;
_norm_info = norm_info;
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
case DataType::F32:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>;
}
@@ -104,7 +111,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, true>;
}
@@ -124,11 +131,11 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>;
}
@@ -139,7 +146,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>;
}
@@ -196,8 +203,9 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
const auto beta_vec = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
- auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr,
- T * output_ptr)
+ auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row,
+ const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr,
+ T *output_ptr)
{
const int current_slice = dim == 0 ? x : id[dim];
const int first_slice = std::max(current_slice - radius, 0);
@@ -206,75 +214,87 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
// Accumulate 2D In-Map values
auto accu = static_cast<T>(0.f);
- for(int j = first_row; j <= last_row; ++j)
+ for (int j = first_row; j <= last_row; ++j)
{
// Compute row displacement
const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
+ for (int i = first_slice; i <= last_slice; ++i)
{
- accu += *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
+ accu +=
+ *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
}
}
// Normalize
- const auto normalized = std::pow(accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
+ const auto normalized = std::pow(
+ accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
const auto normalized_pixel = (*(input_ptr + x)) / normalized;
*(output_ptr + x) = normalized_pixel;
};
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- int x = window_start_x;
- // Compute serially starting elements for the case x dimension is width
- for(; x < radius && x < window_end_x && dim == 0; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
+ int x = window_start_x;
+ // Compute serially starting elements for the case x dimension is width
+ for (; x < radius && x < window_end_x && dim == 0; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
- // Compute vectorized
- for(; x <= window_end_x - window_step_x - radius; x += window_step_x)
- {
- const int current_slice = dim == 0 ? x : id[dim];
- const int first_slice = std::max(current_slice - radius, 0);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
- // Accumulate 2D In-Map values
- auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- for(int j = first_row; j <= last_row; ++j)
+ // Compute vectorized
+ for (; x <= window_end_x - window_step_x - radius; x += window_step_x)
{
- // Compute row displacement
- const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
+ const int current_slice = dim == 0 ? x : id[dim];
+ const int first_slice = std::max(current_slice - radius, 0);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
+ // Accumulate 2D In-Map values
+ auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ for (int j = first_row; j <= last_row; ++j)
{
- accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+ // Compute row displacement
+ const uint8_t *const input_squared_ptr =
+ input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = wrapper::vadd(
+ accu, wrapper::vloadq(reinterpret_cast<const T *>(
+ input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+ }
}
- }
- // Normalize
- const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
- const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
- wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
- }
+ // Normalize
+ const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+ const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
+ wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
- },
- input, input_squared, output);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
+ },
+ input, input_squared, output);
}
-Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
+Status NENormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
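As a reading aid for the normalize_float reflow above: with accu the accumulated sum of squares over the normalization neighbourhood, both the scalar sequential_normalization path and the vectorized vpow(vmla(kappa_vec, coeff_vec, accu), beta_vec) path compute

    normalized = (scale_coeff * accu + kappa)^beta
    output     = input / normalized

so the reflow only re-wraps the expressions; the arithmetic is unchanged.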
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h
index 53a06b9ed9..2d8d9f3d60 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -60,7 +60,8 @@ public:
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+ void
+ configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -72,7 +73,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 734510b637..c9bcbc9127 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -28,26 +28,31 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &paddings,
+ const PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
- const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
+ const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
template <typename T>
void NEPadLayerKernel::run_pad_constant(const Window &window)
{
- Window output_window{ window };
+ Window output_window{window};
output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
const size_t element_size = _input->info()->element_size();
Iterator output_it(_output, output_window);
- execute_window_loop(output_window, [&](const Coordinates & id)
- {
- Coordinates idin{ id };
- for(size_t dim = _padding.size() - 1; dim > 0; --dim)
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id)
{
- idin[dim] -= _padding[dim].first;
- if(idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ Coordinates idin{id};
+ for (size_t dim = _padding.size() - 1; dim > 0; --dim)
{
- std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0), _constant_value.get<T>());
- return;
+ idin[dim] -= _padding[dim].first;
+ if (idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ {
+ std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0),
+ _constant_value.get<T>());
+ return;
+ }
}
- }
- T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
- T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
- std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
- memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
- std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get<T>());
- },
- output_it);
+ T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
+ T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
+ std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
+ memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
+ std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second,
+ _constant_value.get<T>());
+ },
+ output_it);
}
void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window)
@@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t end_plane = window.z().end();
size_t start_plane_input = start_plane;
- if(_padding.size() > 2)
+ if (_padding.size() > 2)
{
start_plane_input = (start_plane < _padding[2].first) ? 0 : start_plane - _padding[2].first;
}
@@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t jump_to_next_row_input = _input->info()->dimension(0);
const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second;
- uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
- const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
- const auto pad_value = _constant_value.get<uint8_t>();
+ uint8_t *output_row_ptr =
+ _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
+ const uint8_t *input_it_ptr =
+ _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
+ const auto pad_value = _constant_value.get<uint8_t>();
- for(size_t z_i = start_plane; z_i < end_plane; ++z_i)
+ for (size_t z_i = start_plane; z_i < end_plane; ++z_i)
{
- if(_padding.size() > 2 && z_i < _padding[2].first)
+ if (_padding.size() > 2 && z_i < _padding[2].first)
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
}
- else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
+ else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
@@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
output_row_ptr += pad_y_elems_top;
size_t y_i = _input->info()->dimension(1);
// Basic loop unrolling
- for(; y_i > 3; y_i -= 4)
+ for (; y_i > 3; y_i -= 4)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
memset(output_row_ptr, pad_value, _padding[0].second);
output_row_ptr += _padding[0].second;
}
- for(; y_i > 0; --y_i)
+ for (; y_i > 0; --y_i)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel()
{
}
-void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Auto-init
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
- const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
+ const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
auto_init_if_empty(*output->info(), expected_output_info);
// Perform validation step
@@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
_constant_value = constant_value;
_mode = mode;
- if(_mode == PaddingMode::CONSTANT)
+ if (_mode == PaddingMode::CONSTANT)
{
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 1:
- if(_input->info()->num_dimensions() == 3 && // Is 3D
- padding.size() <= 3 && // Has 3D padding
- !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
+ if (_input->info()->num_dimensions() == 3 && // Is 3D
+ padding.size() <= 3 && // Has 3D padding
+ !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
{
_func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad;
}
@@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
ICPPKernel::configure(win);
}
-Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode));
@@ -253,7 +273,7 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_func != nullptr)
+ if (_func != nullptr)
{
(this->*_func)(window);
}
@@ -263,7 +283,7 @@ size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) c
{
ARM_COMPUTE_UNUSED(thread_count);
ARM_COMPUTE_UNUSED(platform);
-
+
return ICPPKernel::default_mws;
}
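The run_pad_constant reflow above keeps the same row-wise structure: fill the left border with the constant, copy the input row, fill the right border. A minimal sketch of that inner step (hypothetical free function, element type fixed to float for brevity; the kernel itself is templated and also handles out-of-range planes):

#include <algorithm>
#include <cstddef>
#include <cstring>

// Writes pad_left + n + pad_right elements to dst: constant, payload, constant.
void pad_row_constant(
    const float *src, float *dst, std::size_t n, std::size_t pad_left, std::size_t pad_right, float value)
{
    std::fill_n(dst, pad_left, value);                   // left border
    std::memcpy(dst + pad_left, src, n * sizeof(float)); // copy the input row
    std::fill_n(dst + pad_left + n, pad_right, value);   // right border
}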
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h
index f82af1558a..d432887d2c 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.h
+++ b/src/core/NEON/kernels/NEPadLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEPADLAYERKERNEL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -62,7 +63,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
* Only CONSTANT padding mode is currently supported
*/
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -75,7 +80,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 3d89933377..15e933e66e 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +37,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
}
} // namespace
-NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
{
}
-void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
- const int height)
+void NEPriorBoxLayerKernel::store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height)
{
float xmin = (center_x - box_width / 2.f) / width;
float ymin = (center_y - box_height / 2.f) / height;
float xmax = (center_x + box_width / 2.f) / width;
float ymax = (center_y + box_height / 2.f) / height;
- float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
- if(_info.clip())
+ float32x4_t vec_elements = {xmin, ymin, xmax, ymax};
+ if (_info.clip())
{
static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
int img_width = _info.img_size().x;
int img_height = _info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = _input2->info()->dimension(width_idx);
img_height = _input2->info()->dimension(height_idx);
@@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
float step_x = _info.steps()[0];
float step_y = _info.steps()[1];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
Iterator output(_output, slice);
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- float center_x = 0;
- float center_y = 0;
- int idx = id.x() / (4 * num_priors);
- center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
- center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-
- float box_width;
- float box_height;
- int offset = 0;
-
- auto out = reinterpret_cast<float *>(output.ptr());
- for(unsigned int i = 0; i < _info.min_sizes().size(); ++i)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- const float min_size = _info.min_sizes().at(i);
- box_width = min_size;
- box_height = min_size;
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
-
- if(!_info.max_sizes().empty())
+ float center_x = 0;
+ float center_y = 0;
+ int idx = id.x() / (4 * num_priors);
+ center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+ center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+
+ float box_width;
+ float box_height;
+ int offset = 0;
+
+ auto out = reinterpret_cast<float *>(output.ptr());
+ for (unsigned int i = 0; i < _info.min_sizes().size(); ++i)
{
- const float max_size = _info.max_sizes().at(i);
- box_width = std::sqrt(min_size * max_size);
- box_height = box_width;
-
+ const float min_size = _info.min_sizes().at(i);
+ box_width = min_size;
+ box_height = min_size;
store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
offset += 4;
- }
- // rest of priors
- for(auto ar : _info.aspect_ratios())
- {
- if(fabs(ar - 1.) < 1e-6)
+ if (!_info.max_sizes().empty())
{
- continue;
+ const float max_size = _info.max_sizes().at(i);
+ box_width = std::sqrt(min_size * max_size);
+ box_height = box_width;
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
}
- box_width = min_size * sqrt(ar);
- box_height = min_size / sqrt(ar);
+ // rest of priors
+ for (auto ar : _info.aspect_ratios())
+ {
+ if (fabs(ar - 1.) < 1e-6)
+ {
+ continue;
+ }
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
+ box_width = min_size * sqrt(ar);
+ box_height = min_size / sqrt(ar);
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
+ }
}
- }
- // set the variance
- out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
- float32x4_t var;
- if(_info.variances().size() == 1)
- {
- var = vdupq_n_f32(_info.variances().at(0));
- }
- else
- {
- const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
- var = vars;
- }
- for(int i = 0; i < num_priors; ++i)
- {
- vst1q_f32(out + 4 * i, var);
- }
- },
- output);
+ // set the variance
+ out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+ float32x4_t var;
+ if (_info.variances().size() == 1)
+ {
+ var = vdupq_n_f32(_info.variances().at(0));
+ }
+ else
+ {
+ const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2),
+ _info.variances().at(3)};
+ var = vars;
+ }
+ for (int i = 0; i < num_priors; ++i)
+ {
+ vst1q_f32(out + 4 * i, var);
+ }
+ },
+ output);
}
-void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayerKernel::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu
INEKernel::configure(win);
}
-Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
@@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info)
// Run function
calculate_prior_boxes(window);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
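The store_coordinates reflow above converts a (center, size) prior box to corner coordinates normalized by the image size, optionally clipping to [0, 1]. A scalar sketch of the same arithmetic, without the NEON clamp (names are illustrative, not the kernel's interface):

#include <algorithm>

struct Box
{
    float xmin, ymin, xmax, ymax;
};

// Convert a (center, size) box to corners normalized by the image size, clipped to [0, 1] on request.
Box make_prior(float cx, float cy, float bw, float bh, int img_w, int img_h, bool clip)
{
    Box b{(cx - bw / 2.f) / img_w, (cy - bh / 2.f) / img_h, (cx + bw / 2.f) / img_w, (cy + bh / 2.f) / img_h};
    if (clip)
    {
        b.xmin = std::min(std::max(b.xmin, 0.f), 1.f);
        b.ymin = std::min(std::max(b.ymin, 0.f), 1.f);
        b.xmax = std::min(std::max(b.xmax, 0.f), 1.f);
        b.ymax = std::min(std::max(b.ymax, 0.f), 1.f);
    }
    return b;
}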
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
index 430a47f9f8..460f80e085 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
@@ -67,7 +67,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -84,7 +87,14 @@ private:
* @param[in] width Input width.
* @param[in] height Input height.
*/
- void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
+ void store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height);
/** Function to calculate prior boxes.
*
* @param[in] window Input region on which to execute the kernel.
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index 46a0f625ce..8e1ed3a2a5 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -26,17 +26,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/NESymm.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
#include <map>
@@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
const int64_t b_3 = vgetlane(b_high, 1);
int64x2x2_t result;
- const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 };
- const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 };
+ const int64x2_t result_0{a_0 * b_0, a_1 * b_1};
+ const int64x2_t result_1{a_2 * b_2, a_3 * b_3};
result.val[0] = vadd(vmovl(vgetlow(bias)), result_0);
result.val[1] = vadd(vmovl(vgethigh(bias)), result_1);
@@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
}
} // namespace
-void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias)
+void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *weight,
+ const ITensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info()));
- static const std::map<DataType, ComputeFuncType> fn_map =
- {
- { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) },
+ static const std::map<DataType, ComputeFuncType> fn_map = {
+ {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)},
};
_input = input;
@@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o
_output->info()->set_quantization_info(compute_output_qinfo());
const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform();
- const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
+ const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
_output_shift *= -1;
- if(!bool(s))
+ if (!bool(s))
{
_output_multiplier = 0;
_output_shift = 0;
@@ -134,7 +136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target)
return window;
}
-Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_UNUSED(output, bias, weight, input);
@@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -182,11 +187,11 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
using AccType = int64_t;
using InputDataType = int16_t;
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
+ AccType sum{0};
+ AccType sum_sq{0};
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
using namespace wrapper;
const int16x8_t val = vloadq(input_ptr + x);
@@ -216,7 +221,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
#endif // __aarch64__
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
const InputDataType val = input_ptr[x];
sum += static_cast<AccType>(val);
@@ -230,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift)
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift)
{
using OutputDataType = int16_t;
@@ -238,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{});
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
const int16x8_t val = vloadq(input_ptr + x);
int32x4x2_t shifted;
@@ -267,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
vstore(output_ptr + x + 4, vqmovn(out_val.val[1]));
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
- const auto val = static_cast<int32_t>(input_ptr[x]);
- const int32_t shifted = (val << 10) - mean;
- const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
- const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
+ const auto val = static_cast<int32_t>(input_ptr[x]);
+ const int32_t shifted = (val << 10) - mean;
+ const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
+ const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
const auto reverse_shifted = static_cast<int32_t>((weighted + 512) >> 10);
- int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
- out_val = utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
- output_ptr[x] = static_cast<OutputDataType>(out_val);
+ int32_t out_val =
+ quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
+ out_val =
+ utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
+ output_ptr[x] = static_cast<OutputDataType>(out_val);
}
}
@@ -287,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16()
using BiasDataType = int32_t;
using AccType = int64_t;
- Iterator input_iterator{ _input, _inout_window };
- Iterator output_iterator{ _output, _inout_window };
- Iterator weight_iterator{ _weight, _weight_window };
- Iterator bias_iterator{ _bias, _weight_window };
+ Iterator input_iterator{_input, _inout_window};
+ Iterator output_iterator{_output, _inout_window};
+ Iterator weight_iterator{_weight, _weight_window};
+ Iterator bias_iterator{_bias, _weight_window};
const auto weight_ptr = reinterpret_cast<const InputDataType *>(weight_iterator.ptr());
const auto bias_ptr = reinterpret_cast<const BiasDataType *>(bias_iterator.ptr());
const uint32_t column_size = _input->info()->tensor_shape()[0];
- execute_window_loop(_inout_window, [ &, this](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
- auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
-
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
- std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
-
- AccType mean{ 0 };
- AccType variance{ 0 };
- std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
-
- int32_t stddev_invsqrt_mul{};
- int32_t stddev_invsqrt_shift{};
- quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift);
-
- normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
- },
- input_iterator, output_iterator);
+ execute_window_loop(
+ _inout_window,
+ [&, this](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
+ auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
+
+ AccType sum{0};
+ AccType sum_sq{0};
+ std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
+
+ AccType mean{0};
+ AccType variance{0};
+ std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
+
+ int32_t stddev_invsqrt_mul{};
+ int32_t stddev_invsqrt_shift{};
+ quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul,
+ stddev_invsqrt_shift);
+
+ normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
+ },
+ input_iterator, output_iterator);
}
} // namespace arm_compute
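The compute_qsymm16 loop reflowed above first accumulates sum and sum_sq per row and then derives the statistics from them; independent of the fixed-point details, the underlying identity is

    mean     = sum / n
    variance = sum_sq / n - mean^2

i.e. E[x^2] - E[x]^2, which is why a single pass collecting both accumulators suffices before the inverse-sqrt multiplier is computed for the normalization step.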
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
index a3ff6e988f..af5b6a0315 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
#include "src/core/NEON/INEKernel.h"
+
#include <functional>
namespace arm_compute
@@ -69,34 +70,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
// constants
- static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */
- static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
- static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */
- static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */
+ static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */
+ static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */
+ static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */
+ static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */
using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
ComputeFuncType _fn{}; /**< Function pointer to computation function */
- const ITensor *_input
- {
- nullptr
- }; /**< Input tensor */
- const ITensor *_weight
- {
- nullptr
- }; /**< Weight tensor */
- const ITensor *_bias
- {
- nullptr
- }; /**< Bias tensor */
- ITensor *_output{ nullptr }; /**< Output tensor */
+ const ITensor *_input{nullptr}; /**< Input tensor */
+ const ITensor *_weight{nullptr}; /**< Weight tensor */
+ const ITensor *_bias{nullptr}; /**< Bias tensor */
+ ITensor *_output{nullptr}; /**< Output tensor */
int32_t _output_multiplier{}; /**< Multiplier for output values */
int32_t _output_shift{}; /**< Shift value for output values */
@@ -138,7 +131,9 @@ private:
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift);
/** Function to compute output quantization information */
QuantizationInfo compute_output_qinfo();
};
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index 802aebb526..486cd6d331 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/Utility.h"
-#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/roialign/list.h"
@@ -49,7 +50,12 @@ struct ROIAlignSelectorData
};
using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
-using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)>::type;
struct ROIAlignKernel
{
@@ -58,31 +64,18 @@ struct ROIAlignKernel
ROIAlignUKernelPtr ukernel;
};
-static const ROIAlignKernel available_kernels[] =
-{
- {
- "fp32_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)
- },
+static const ROIAlignKernel available_kernels[] = {
+ {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)
- },
+ {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "qu8_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)
- },
- {
- "qs8_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)
- },
+ {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)},
+ {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
};
@@ -94,9 +87,9 @@ static const ROIAlignKernel available_kernels[] =
*/
const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -104,24 +97,29 @@ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
+ if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -143,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel()
{
}
-void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
// Output auto initialization if not yet initialized
const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info);
- auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
output->info()->set_data_layout(input->info()->data_layout());
// Configure kernel window
@@ -167,7 +169,10 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois,
INEKernel::configure(window);
}
-Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
@@ -176,9 +181,9 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn
void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
{
const DataLayout data_layout = _input->info()->data_layout();
- if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
{
- const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _rois, _pool_info, window, info);
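
The table above is an instance of the micro-kernel registry pattern these files share: an array of {name, selection predicate, function pointer} entries scanned in order by get_implementation(), returning the first match or nullptr. A self-contained sketch of the pattern, with simplified placeholder types (not the library's actual signatures):

// Sketch of the kernel-registry/selector pattern (simplified types).
enum class DataType { F32, F16, QASYMM8 };

struct SelectorData { DataType dt; };

struct KernelEntry
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)(const float *, float *, int);
};

void fp32_kernel(const float *in, float *out, int n)
{
    for (int i = 0; i < n; ++i) { out[i] = in[i]; }
}

static const KernelEntry available_kernels[] = {
    {"fp32_kernel", [](const SelectorData &d) { return d.dt == DataType::F32; }, fp32_kernel},
};

const KernelEntry *get_implementation(const SelectorData &data)
{
    // First entry whose predicate accepts the query data wins.
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data)) { return &uk; }
    }
    return nullptr; // caller must handle "no kernel found"
}
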
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index 48a3de7285..9cc538b429 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -83,7 +83,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 400e8291d6..1a3810fb56 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +38,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois);
@@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
}
@@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
 * @param[in] roi_indx Index of the ROI in the output tensor in which to store the value
*/
template <typename T>
-void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y,
- int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx)
+void template_eval(const ITensor *input,
+ const ITensor *output,
+ int region_start_x,
+ int region_start_y,
+ int region_end_x,
+ int region_end_y,
+ int fm,
+ int px,
+ int py,
+ int roi_batch,
+ int roi_indx)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
*reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0;
}
else
{
T curr_max = std::numeric_limits<T>::lowest(); // Min value of typename T
- for(int j = region_start_y; j < region_end_y; ++j)
+ for (int j = region_start_y; j < region_end_y; ++j)
{
- for(int i = region_start_x; i < region_end_x; ++i)
+ for (int i = region_start_x; i < region_end_x; ++i)
{
const auto val = *reinterpret_cast<const T *>(input->ptr_to_element(Coordinates(i, j, fm, roi_batch)));
curr_max = std::max(val, curr_max);
@@ -93,11 +108,13 @@ void template_eval(const ITensor *input, const ITensor *output, int region_start
}
// if quantized datatype, requantize then store in output tensor
- if(is_data_type_quantized(input->info()->data_type()))
+ if (is_data_type_quantized(input->info()->data_type()))
{
// convert qasymm to the new output quantization scale and offset
- UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo);
+ UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(
+ input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) =
+ quantize_qasymm8(curr_max, uqinfo);
}
else
{
@@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
{
}
-Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
}
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ const ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
@@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
// Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) ||
+ (output->info()->dimension(1) != pool_info.pooled_height()));
// Set instance variables
_input = input;
@@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer());
const auto data_type = _input->info()->data_type();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
{
const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
const auto x1 = rois_ptr[values_per_roi * roi_indx + 1];
@@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
// Iterate through all feature maps
- for(int fm = 0; fm < fms; ++fm)
+ for (int fm = 0; fm < fms; ++fm)
{
// Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
+ for (int py = 0; py < pooled_h; ++py)
{
- for(int px = 0; px < pooled_w; ++px)
+ for (int px = 0; px < pooled_w; ++px)
{
auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
- auto region_end_x = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
- auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
- auto region_end_y = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+ auto region_end_x =
+ static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+ auto region_start_y =
+ static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+ auto region_end_y =
+ static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
- template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
case DataType::QASYMM8:
- template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
default:
ARM_COMPUTE_ERROR("DataType not Supported");
@@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
}
}
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
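
For reference, the region arithmetic reformatted in run() above maps each output cell (px, py) to a slice of the ROI: bounds are scaled by roi_width/pooled_w (resp. height), shifted by the ROI anchor, and clamped to the feature map. A scalar sketch under those assumptions (helper names are illustrative, not library API):

#include <algorithm>
#include <cmath>

struct Region { int start_x, end_x, start_y, end_y; };

// Illustrative stand-alone version of the pooling-region computation above.
Region pooled_region(int px, int py, int pooled_w, int pooled_h,
                     int roi_width, int roi_height,
                     int roi_anchor_x, int roi_anchor_y,
                     int fm_width, int fm_height)
{
    auto start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
    auto end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
    auto start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
    auto end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));

    // Shift by the ROI anchor and clamp to the feature-map bounds, exactly as
    // the kernel does before evaluating the max over the region.
    start_x = std::min(std::max(start_x + roi_anchor_x, 0), fm_width);
    end_x   = std::min(std::max(end_x + roi_anchor_x, 0), fm_width);
    start_y = std::min(std::max(start_y + roi_anchor_y, 0), fm_height);
    end_y   = std::min(std::max(end_y + roi_anchor_y, 0), fm_height);
    return {start_x, end_x, start_y, end_y};
}
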
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
index e7a7e90eef..81f6006ea2 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -63,7 +63,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor.
*/
- void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,7 +83,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
const ITensor *_input;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index ec63a35de9..87b7b76b72 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -29,11 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/range/list.h"
namespace arm_compute
@@ -55,48 +56,23 @@ struct RangeUKernel
RangeUKernelPtr ukernel;
};
-static const RangeUKernel available_kernels[] =
-{
- {
- "fp16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)
- },
- {
- "f32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)
- },
- {
- "u8_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)
- },
- {
- "u16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)
- },
- {
- "u32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)
- },
- {
- "s8_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)
- },
- {
- "s16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)
- },
- {
- "s32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)
- },
+static const RangeUKernel available_kernels[] = {
+ {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)},
+ {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)},
+ {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)},
+ {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)},
+ {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)},
+ {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)},
+ {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)},
+ {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)},
};
/** Micro-kernel selector
@@ -107,9 +83,9 @@ static const RangeUKernel available_kernels[] =
*/
const RangeUKernel *get_implementation(const RangeSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -119,28 +95,31 @@ const RangeUKernel *get_implementation(const RangeSelectorData &data)
Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
{
- const auto *uk = get_implementation(RangeSelectorData{ output.data_type() });
+ const auto *uk = get_implementation(RangeSelectorData{output.data_type()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-NERangeKernel::NERangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
}
@@ -151,7 +130,8 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
// Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1,
+ output->info()->data_type(), output->info()->quantization_info());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -178,7 +158,7 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() });
+ const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()});
uk->ukernel(_output, _start, _step, window);
}
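
The checks reformatted above pin down when a range is well-formed: start != end, step signed consistently with the direction, and an output large enough for num_of_elements_in_range(start, end, step). Assuming the usual ceil((end - start) / step) definition for that helper (an assumption; the library's implementation is not shown here), a sketch:

#include <cmath>
#include <cstddef>
#include <stdexcept>

// Hedged reimplementation of the range-length rule enforced above; the real
// num_of_elements_in_range() helper lives elsewhere in the library.
size_t elements_in_range(float start, float end, float step)
{
    if (start == end)               throw std::invalid_argument("start must differ from end");
    if (start < end && step <= 0.f) throw std::invalid_argument("step must be > 0 when start < end");
    if (start > end && step >= 0.f) throw std::invalid_argument("step must be < 0 when start > end");
    // Signs are consistent at this point, so the quotient is positive.
    return static_cast<size_t>(std::ceil((end - start) / step));
}
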
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 90560995e6..fa555c2c2e 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NERANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 19955af493..455d604b3b 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -28,16 +28,17 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "support/SaturateCast.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
namespace arm_compute
@@ -48,7 +49,7 @@ namespace
template <typename T>
void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0)
{
- if(std::is_same<T, uint8_t>::value)
+ if (std::is_same<T, uint8_t>::value)
{
auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2));
wrapper::vstore(output.ptr() + offset, res);
@@ -63,8 +64,8 @@ void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset
template <typename T>
uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4_t mask{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4_t mask{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask = wrapper::vcgt(b, a);
}
@@ -73,12 +74,12 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp
mask = wrapper::vclt(b, a);
}
- uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
- if(axis != 0)
+ uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3};
+ if (axis != 0)
{
vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
+ uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}};
return res;
}
@@ -86,9 +87,9 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp
template <typename T>
uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4x4_t mask{ { 0 } };
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4x4_t mask{{0}};
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask_u8 = wrapper::vcgt(b, a);
}
@@ -96,44 +97,43 @@ uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, R
{
mask_u8 = wrapper::vclt(b, a);
}
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
-
- uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 },
- { idx + 8, idx + 9, idx + 10, idx + 11 },
- { idx + 12, idx + 13, idx + 14, idx + 15 }
- }
- };
- if(axis != 0)
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ mask.val[0] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ mask.val[1] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ mask.val[2] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ mask.val[3] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+
+ uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3},
+ {idx + 4, idx + 5, idx + 6, idx + 7},
+ {idx + 8, idx + 9, idx + 10, idx + 11},
+ {idx + 12, idx + 13, idx + 14, idx + 15}}};
+ if (axis != 0)
{
vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res =
- {
- {
- vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
- vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
- vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
- vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
- }
- };
+ uint32x4x4_t res = {
+ {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+ vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}};
return res;
}
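
A scalar model of what calculate_index()/calculate_index_quantized() above compute per lane: the comparison mask (vcgt/vclt) bit-selects, for each lane, between the candidate element's index and the previously stored index. Plain C++ equivalent (illustrative only, no NEON):

#include <cstddef>
#include <cstdint>

// Scalar model of mask-driven argmin index tracking; the vector code keeps
// the value update (vmin) and the index select (vbsl) as separate steps.
void update_argmin_indices(const float *candidate, float *best, uint32_t *best_idx,
                           uint32_t base_idx, size_t lanes)
{
    for (size_t lane = 0; lane < lanes; ++lane)
    {
        const bool take_new = candidate[lane] < best[lane]; // the per-lane comparison mask
        if (take_new)
        {
            best[lane]     = candidate[lane];
            best_idx[lane] = base_idx + static_cast<uint32_t>(lane); // vec_idx lane
        }
    }
}
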
// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_min(T in)
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_min(T in)
{
auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
return wrapper::vpmin(pmin, pmin);
@@ -141,9 +141,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_
// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_min(T in)
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_min(T in)
{
auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
pmin = wrapper::vpmin(pmin, pmin);
@@ -153,9 +154,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s
// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_max(T in)
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_max(T in)
{
auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
return wrapper::vpmax(pmax, pmax);
@@ -163,9 +165,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_
// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_max(T in)
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_max(T in)
{
auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
pmax = wrapper::vpmax(pmax, pmax);
@@ -176,10 +179,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s
template <typename T>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
- uint32x4_t res_idx_mask{ 0 };
+ uint32x4_t res_idx_mask{0};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- if(op == ReductionOperation::ARG_IDX_MIN)
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -203,10 +206,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, Reduc
template <typename T>
uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
- uint32x4x4_t res_idx_mask{ { 0 } };
+ uint32x4x4_t res_idx_mask{{0}};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -218,12 +221,18 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va
}
// Widen vectors
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ auto wide_u32_3 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ auto wide_u32_4 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
@@ -241,19 +250,19 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va
pmin = wrapper::vpmin(pmin, pmin);
res = std::min(wrapper::vgetlane(pmin, 0), res);
iter++;
- }
- while(iter < 4);
+ } while (iter < 4);
return (res - 0xFFFFFFFF);
}
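
The (res - 0xFFFFFFFF) return above relies on a wrap-around trick: after the mask-and-add, matching lanes hold idx + 0xFFFFFFFF (i.e. idx - 1 modulo 2^32) while non-matching lanes hold 0xFFFFFFFF, so an unsigned minimum across lanes followed by subtracting 0xFFFFFFFF (adding 1 modulo 2^32) yields the smallest matching index. A scalar model of that extraction:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Scalar equivalent of the lane-index extraction above; unsigned arithmetic
// wraps, which is exactly what the trick depends on.
uint32_t smallest_matching_index(const uint32_t *indices, const bool *matches, size_t lanes)
{
    uint32_t res = 0xFFFFFFFFu;
    for (size_t lane = 0; lane < lanes; ++lane)
    {
        // Matching lane: idx + 0xFFFFFFFF == idx - 1 (mod 2^32); otherwise 0xFFFFFFFF.
        const uint32_t lane_val = (matches[lane] ? indices[lane] : 0u) + 0xFFFFFFFFu;
        res = std::min(res, lane_val);
    }
    return res - 0xFFFFFFFFu; // equivalent to res + 1 in modular arithmetic
}
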
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
-uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+uint32x4x4_t
+calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4x2_t mask{ 0 };
- uint16x8_t mask_u16{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4x2_t mask{0};
+ uint16x8_t mask_u16{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask_u16 = wrapper::vcgt(b, a);
}
@@ -263,19 +272,14 @@ uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x
}
mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
- uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 }
- }
- };
- if(axis != 0)
+ uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}};
+ if (axis != 0)
{
vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
- wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
- 0, 0
- };
+ uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+ wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0};
return res;
}
@@ -298,10 +302,10 @@ inline float16x4_t calculate_max(float16x8_t in)
template <>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
{
- uint32x4x2_t res_idx_mask{ 0 };
+ uint32x4x2_t res_idx_mask{0};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
uint16x8_t mask_u16;
- if(op == ReductionOperation::ARG_IDX_MIN)
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -313,8 +317,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va
}
// Widen vectors
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
@@ -328,8 +334,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va
pmin = wrapper::vpmin(pmin, pmin);
res = std::min(wrapper::vgetlane(pmin, 0), res);
iter++;
- }
- while(iter < 2);
+ } while (iter < 2);
return (res - 0xFFFFFFFF);
}
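
The vshll/vmovl/vorr chains in these helpers widen a narrow comparison mask so it can bit-select 32-bit index lanes: each 0xFF (or 0xFFFF) lane must expand to a full 0xFFFFFFFF. Scalar equivalent of the widening (illustration only):

#include <cstddef>
#include <cstdint>

// What the widening chains build, one lane at a time:
// 0x00 -> 0x00000000, 0xFF -> 0xFFFFFFFF.
void widen_mask_u8_to_u32(const uint8_t *mask8, uint32_t *mask32, size_t lanes)
{
    for (size_t i = 0; i < lanes; ++i)
    {
        mask32[i] = mask8[i] ? 0xFFFFFFFFu : 0u;
    }
}
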
@@ -388,7 +393,8 @@ struct RedOpX
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
const size_t input_dim_0 = in->info()->dimension(0);
const int window_step_x = 16 / sizeof(T);
@@ -402,211 +408,217 @@ struct RedOpX
Iterator output(out, out_window);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-
- auto init_res_value = static_cast<T>(0.f);
- switch(op)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- init_res_value = static_cast<T>(*input_ptr);
- break;
- }
- case ReductionOperation::PROD:
- {
- init_res_value = static_cast<T>(1.f);
- break;
- }
- default:
- break;
- }
- auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
- uint32x4x4_t vec_res_idx{ { 0 } };
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
+ auto init_res_value = static_cast<T>(0.f);
+ switch (op)
{
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
+ case ReductionOperation::ARG_IDX_MIN:
case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ init_res_value = static_cast<T>(*input_ptr);
break;
}
- case ReductionOperation::MAX:
+ case ReductionOperation::PROD:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ init_res_value = static_cast<T>(1.f);
break;
}
default:
- ARM_COMPUTE_ERROR("Not supported");
+ break;
}
- }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+ uint32x4x4_t vec_res_idx{{0}};
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM_SQUARE:
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
- auto res = static_cast<T>(0.f);
- for(int i = 0; i < S; ++i)
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
{
- res += wrapper::vgetlane(vec_res_value, i);
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
-#else // ARM_COMPUTE_DEBUG_ENABLED
- auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- for(int i = 0; i < S / 4; ++i)
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM_SQUARE:
{
- carry_res = wrapper::vpadd(carry_res, carry_res);
- }
- auto res = wrapper::vgetlane(carry_res, 0);
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+ auto res = static_cast<T>(0.f);
+ for (int i = 0; i < S; ++i)
+ {
+ res += wrapper::vgetlane(vec_res_value, i);
+ }
+#else // ARM_COMPUTE_DEBUG_ENABLED
+ auto carry_res =
+ wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ for (int i = 0; i < S / 4; ++i)
+ {
+ carry_res = wrapper::vpadd(carry_res, carry_res);
+ }
+ auto res = wrapper::vgetlane(carry_res, 0);
#endif // ARM_COMPUTE_DEBUG_ENABLED
- if(op == ReductionOperation::SUM_SQUARE)
- {
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ if (op == ReductionOperation::SUM_SQUARE)
+ {
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += (*(input_ptr + x)) * (*(input_ptr + x));
+ }
+ }
+ else
{
- res += (*(input_ptr + x)) * (*(input_ptr + x));
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
}
+
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res /= input_dim_0;
+ }
+
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- else
+ case ReductionOperation::PROD:
{
+ auto carry_res =
+ wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ T res = 1;
+ for (int i = 0; i < S / 2; ++i)
+ {
+ res *= wrapper::vgetlane(carry_res, i);
+ }
+
// Compute left-over elements
- for(; x < window_end_x; ++x)
+ for (; x < window_end_x; ++x)
{
- res += *(input_ptr + x);
+ res *= *(input_ptr + x);
}
- }
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= input_dim_0;
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::PROD:
- {
- auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- T res = 1;
- for(int i = 0; i < S / 2; ++i)
+ case ReductionOperation::ARG_IDX_MIN:
{
- res *= wrapper::vgetlane(carry_res, i);
- }
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res *= *(input_ptr + x);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
}
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::ARG_IDX_MAX:
{
- if(*(input_ptr + x) < res)
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
}
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MIN:
{
- if(*(input_ptr + x) > res)
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::MIN:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MAX:
{
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::MAX:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input, output);
+ },
+ input, output);
}
};
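
RedOpX above follows the standard vector-then-tail structure: accumulate window_step_x elements per iteration into vector lanes, horizontally reduce, then finish the leftover elements scalar-wise. A scalar sketch of that control flow (sum reduction only, simplified from the multi-op switch):

#include <cstddef>

// Scalar sketch of the vector loop + leftover tail used by RedOpX.
float reduce_sum(const float *input, size_t n)
{
    constexpr size_t step = 4; // stands in for window_step_x = 16 / sizeof(T)
    float acc[step] = {0.f, 0.f, 0.f, 0.f};

    size_t x = 0;
    for (; x + step <= n; x += step) // "vector" loop: step elements per iteration
    {
        for (size_t lane = 0; lane < step; ++lane)
        {
            acc[lane] += input[x + lane];
        }
    }

    float res = 0.f;
    for (size_t lane = 0; lane < step; ++lane) // horizontal reduction (vpadd in the kernel)
    {
        res += acc[lane];
    }
    for (; x < n; ++x) // leftover tail, element by element
    {
        res += input[x];
    }
    return res;
}
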
template <typename T>
struct RedOpX_quantized
{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
@@ -637,246 +649,257 @@ struct RedOpX_quantized
const float B = out_offset - (in_scale * in_offset) / (out_scale);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+
+ auto vec_res_value1 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value2 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value3 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value4 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+
+ auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+
+ typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0};
+
+ if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN ||
+ op == ReductionOperation::MIN || op == ReductionOperation::MAX)
+ {
+ vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
+ }
+
+ uint32x4x4_t vec_res_idx{{0}};
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
+ const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
- typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = { 0 };
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX)
- {
- vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
- }
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- uint32x4x4_t vec_res_idx{ { 0 } };
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
- const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
-
- //de-quantize vec_elements
- temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
- break;
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
+ }
+
+ switch (op)
+ {
case ReductionOperation::ARG_IDX_MIN:
{
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
break;
}
case ReductionOperation::ARG_IDX_MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
break;
}
case ReductionOperation::MIN:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) < res)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MAX:
{
- if(*(input_ptr + x) > res)
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::MIN:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::PROD:
{
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::MAX:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::PROD:
- {
- auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
+ float res = wrapper::vgetlane(carry_res, 0);
+ res *= wrapper::vgetlane(carry_res, 1);
+ res *= wrapper::vgetlane(carry_res, 2);
+ res *= wrapper::vgetlane(carry_res, 3);
- float res = wrapper::vgetlane(carry_res, 0);
- res *= wrapper::vgetlane(carry_res, 1);
- res *= wrapper::vgetlane(carry_res, 2);
- res *= wrapper::vgetlane(carry_res, 3);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ }
+ else
+ {
+ res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ }
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
+ //re-quantize result
+ if (std::is_same<T, uint8_t>::value)
{
- res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ res = quantize_qasymm8(res, iq_info);
}
else
{
- res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ res = quantize_qasymm8_signed(res, iq_info);
}
- }
- //re-quantize result
- if(std::is_same<T, uint8_t>::value)
- {
- res = quantize_qasymm8(res, iq_info);
+ *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
+ break;
}
- else
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
{
- res = quantize_qasymm8_signed(res, iq_info);
- }
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
- *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
- carry_res = wrapper::vadd(carry_res, vec_res_value3);
- carry_res = wrapper::vadd(carry_res, vec_res_value4);
+ auto carry_paddition =
+ wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res += *(input_ptr + x);
- }
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ const int32_t resFinal = A * (static_cast<float>(res)) + B;
- if(op == ReductionOperation::MEAN_SUM)
- {
- const int32_t resFinal = A * (static_cast<float>(res)) + B;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
+ }
+ else
+ {
+ // Subtract accumulated offsets
+ res -= (in_info.dimension(0) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ }
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
- }
- else
- {
- // Subtract accumulated offsets
- res -= (in_info.dimension(0) - 1) * iq_info.offset;
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ break;
}
-
- break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input, output);
+ },
+ input, output);
}
};
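
The PROD finalisation in the quantized X reducer above de-quantizes each element, multiplies in float, then re-quantizes the scalar result with quantize_qasymm8 / quantize_qasymm8_signed. A minimal sketch of that affine round trip, assuming the usual real = scale * (q - offset) convention; the QInfo struct and helper names below are illustrative only, and the library's exact rounding and saturation policies may differ:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QInfo { float scale; int32_t offset; }; // illustrative stand-in for UniformQuantizationInfo

// Real value represented by a quantized uint8.
static inline float dequantize_u8(uint8_t q, QInfo qi)
{
    return qi.scale * (static_cast<int32_t>(q) - qi.offset);
}

// Quantize back, saturating to the uint8 range.
static inline uint8_t quantize_u8(float v, QInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

The signed variant is identical apart from saturating to [-128, 127].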
@@ -887,7 +910,12 @@ struct RedOpYZW
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
{
const TensorInfo in_info = *(in->info());
const int window_step_x = 16 / sizeof(T);
@@ -900,203 +928,210 @@ struct RedOpYZW
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- neon_vector vec_res_value = { 0 };
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vloadq(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
- break;
- }
- default:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- break;
- }
- }
- uint32x4x4_t vec_res_idx{ { 0 } };
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
+ neon_vector vec_res_value = {0};
+ switch (op)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
+ vec_res_value = wrapper::vloadq(input_ptr + x);
break;
}
- case ReductionOperation::MIN:
+ case ReductionOperation::PROD:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
break;
}
- case ReductionOperation::MAX:
+ default:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
-
- if(op == ReductionOperation::MEAN_SUM)
- {
- auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
- vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
- }
+ uint32x4x4_t vec_res_idx{{0}};
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- if(std::is_same<T, float16_t>::value)
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- }
- else
- {
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
- }
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value = 0.f;
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
+ if (op == ReductionOperation::MEAN_SUM)
{
- res_value = *(input_ptr + x);
- break;
+ auto vec_width_inv =
+ wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
+ vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
}
- case ReductionOperation::PROD:
+
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
- res_value = static_cast<T>(1.f);
- break;
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ if (std::is_same<T, float16_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
}
- default:
+ else
{
- res_value = static_cast<T>(0.f);
- break;
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
}
}
- uint32_t res_idx = 0;
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
-
- switch(op)
+ auto res_value = 0.f;
+ switch (op)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- res_value += *in_ptr;
- break;
- case ReductionOperation::SUM_SQUARE:
- res_value += *in_ptr * *in_ptr;
- break;
- case ReductionOperation::PROD:
- res_value *= *in_ptr;
- break;
+ case ReductionOperation::ARG_IDX_MAX:
case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- if(*in_ptr < res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
+ res_value = *(input_ptr + x);
break;
}
- case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::PROD:
{
- if(*in_ptr > res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
+ res_value = static_cast<T>(1.f);
break;
}
- case ReductionOperation::MIN:
+ default:
{
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ res_value = static_cast<T>(0.f);
break;
}
- case ReductionOperation::MAX:
+ }
+
+ uint32_t res_idx = 0;
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+
+ switch (op)
{
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
- break;
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ res_value += *in_ptr;
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ res_value += *in_ptr * *in_ptr;
+ break;
+ case ReductionOperation::PROD:
+ res_value *= *in_ptr;
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
- if(op == ReductionOperation::MEAN_SUM)
- {
- res_value /= in_info.dimension(axis);
- }
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res_value /= in_info.dimension(axis);
+ }
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
- {
- *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
- }
- else
- {
- *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ }
}
- }
- },
- input, output);
+ },
+ input, output);
}
};
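
RedOpYZW above follows the same split as the X reducer: a vectorised loop that advances window_step_x elements at a time, then a scalar loop for the left-over tail. A standalone sketch of that pattern for a float sum, assuming an ARM NEON target; sum_with_tail is a hypothetical helper, not a library function:

#include <arm_neon.h>

static inline float sum_with_tail(const float *ptr, int start, int end)
{
    constexpr int step = 4;            // elements per vector iteration
    int           x    = start;
    float32x4_t   acc  = vdupq_n_f32(0.f);
    for (; x <= end - step; x += step) // vectorised body
    {
        acc = vaddq_f32(acc, vld1q_f32(ptr + x));
    }
    float res = vgetq_lane_f32(acc, 0) + vgetq_lane_f32(acc, 1) +
                vgetq_lane_f32(acc, 2) + vgetq_lane_f32(acc, 3);
    for (; x < end; ++x)               // compute left-over elements
    {
        res += ptr[x];
    }
    return res;
}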
@@ -1107,7 +1142,8 @@ struct RedOpYZW_complex
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
{
ARM_COMPUTE_ERROR_ON(axis != 2);
ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);
@@ -1124,70 +1160,77 @@ struct RedOpYZW_complex
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- neon_vector vec_res_value_0 = { 0 };
- neon_vector vec_res_value_1 = { 0 };
-
- vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
+ neon_vector vec_res_value_0 = {0};
+ neon_vector vec_res_value_1 = {0};
- const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
- const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+ vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
- vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
- }
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
- wrapper::vstore(out_ptr, vec_res_value_0);
- wrapper::vstore(out_ptr + 4, vec_res_value_1);
- }
+ const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
+ const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value_0 = 0.f;
- auto res_value_1 = 0.f;
+ vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+ vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+ }
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ wrapper::vstore(out_ptr, vec_res_value_0);
+ wrapper::vstore(out_ptr + 4, vec_res_value_1);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- res_value_0 += *in_ptr;
- res_value_1 += *(in_ptr + 1);
+ auto res_value_0 = 0.f;
+ auto res_value_1 = 0.f;
+
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ res_value_0 += *in_ptr;
+ res_value_1 += *(in_ptr + 1);
+ }
+ *out_ptr = res_value_0;
+ *(out_ptr + 1) = res_value_1;
}
- *out_ptr = res_value_0;
- *(out_ptr + 1) = res_value_1;
- }
- },
- input, output);
+ },
+ input, output);
}
};
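
RedOpYZW_complex above sums two-channel (interleaved real/imaginary) float data by accumulating two adjacent vector registers per iteration. One way to express the same idea in isolation, using vld2q_f32 to de-interleave instead of the kernel's two plain loads; this assumes an ARM NEON target and n interleaved complex values:

#include <arm_neon.h>

static inline void sum_complex_interleaved(const float *ptr, unsigned int n,
                                           float &re, float &im)
{
    float32x4_t  acc_re = vdupq_n_f32(0.f);
    float32x4_t  acc_im = vdupq_n_f32(0.f);
    unsigned int i      = 0;
    for (; i + 4 <= n; i += 4)
    {
        // vld2q_f32 de-interleaves: val[0] holds 4 real parts, val[1] 4 imaginary parts
        const float32x4x2_t v = vld2q_f32(ptr + 2 * i);
        acc_re = vaddq_f32(acc_re, v.val[0]);
        acc_im = vaddq_f32(acc_im, v.val[1]);
    }
    re = vgetq_lane_f32(acc_re, 0) + vgetq_lane_f32(acc_re, 1) +
         vgetq_lane_f32(acc_re, 2) + vgetq_lane_f32(acc_re, 3);
    im = vgetq_lane_f32(acc_im, 0) + vgetq_lane_f32(acc_im, 1) +
         vgetq_lane_f32(acc_im, 2) + vgetq_lane_f32(acc_im, 3);
    for (; i < n; ++i) // left-over complex elements
    {
        re += ptr[2 * i];
        im += ptr[2 * i + 1];
    }
}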
template <typename T>
struct RedOpYZW_quantized
{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
{
const TensorInfo in_info = *(in->info());
const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
- using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+ using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
const auto oq_info = out->info()->quantization_info().uniform();
@@ -1201,12 +1244,14 @@ struct RedOpYZW_quantized
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
- using vector_type = typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
+ using vector_type =
+ typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type;
vector_type vec_res_value1{};
@@ -1234,362 +1279,384 @@ struct RedOpYZW_quantized
const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{});
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- uint32x4x4_t vec_res_idx{ { 0 } };
- vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
- vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ uint32x4x4_t vec_res_idx{{0}};
+ vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- auto vec_res_value = wrapper::vloadq(input_ptr + x);
+ vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
- {
- const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
+ auto vec_res_value = wrapper::vloadq(input_ptr + x);
+
+ for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
+ const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
{
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset),
+ wrapper::traits::vector_128_tag{});
+ const auto scale32x4f_4 =
+ wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+ }
- //de-quantize vec_elements
- temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
- break;
- }
+ switch (op)
+ {
case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12,
+ vec_res_idx.val[3]);
break;
}
case ReductionOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
case ReductionOperation::MAX:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
- break;
- }
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract offsets
- auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
+ case ReductionOperation::SUM:
+ {
+ // Subtract offsets
+ auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
- auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
- auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
- auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
- auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
+ auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
+ auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
+ auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
+ auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
- vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
- vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
- vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
- vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
+ vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
+ vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
+ vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
+ vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
- combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
- break;
- }
- case ReductionOperation::MEAN_SUM:
- {
- vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
- vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
- vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
- vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
+ combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
+ break;
+ }
+ case ReductionOperation::MEAN_SUM:
+ {
+ vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+ vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+ vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+ vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
#ifdef __aarch64__
- vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+ vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
#else // defined(__aarch64__)
- vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
+ vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
#endif // __aarch64__
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
-
- //re-quantize
- vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
-
- vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 =
+ wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
+ const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
+
+ //re-quantize
+ vec_res_value1_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value2_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value3_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value4_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+
+ vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- float res_value = 0.f;
- int32_t res_value_q = 0;
-
- switch(op)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- res_value = *(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- res_value = static_cast<T>(1.0f);
- break;
- }
- default:
- {
- res_value = static_cast<T>(0.0f);
- break;
- }
- }
- uint32_t res_idx = 0;
+ float res_value = 0.f;
+ int32_t res_value_q = 0;
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
- {
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
- switch(op)
+ switch (op)
{
- case ReductionOperation::SUM:
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- res_value += *in_ptr;
+ res_value = *(input_ptr + x);
break;
}
- case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::PROD:
{
- res_value_q += *in_ptr;
+ res_value = static_cast<T>(1.0f);
break;
}
- case ReductionOperation::SUM_SQUARE:
+ default:
{
- res_value += *in_ptr * *in_ptr;
+ res_value = static_cast<T>(0.0f);
break;
}
- case ReductionOperation::PROD:
+ }
+ uint32_t res_idx = 0;
+
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
+ switch (op)
{
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
+ case ReductionOperation::SUM:
{
- res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+ res_value += *in_ptr;
+ break;
}
- else
+ case ReductionOperation::MEAN_SUM:
{
- res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+ res_value_q += *in_ptr;
+ break;
}
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- if(*in_ptr < res_value)
+ case ReductionOperation::SUM_SQUARE:
{
- res_value = *in_ptr;
- res_idx = dim;
+ res_value += *in_ptr * *in_ptr;
+ break;
}
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- if(*in_ptr > res_value)
+ case ReductionOperation::PROD:
{
- res_value = *in_ptr;
- res_idx = dim;
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+ }
+ else
+ {
+ res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+ }
+ break;
}
- break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- case ReductionOperation::MIN:
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::MEAN_SUM:
{
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ // Apply previously calculated coefficients (with rounding on aarch64)
+#ifdef __aarch64__
+ const int32_t res =
+ arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+#else // defined(__aarch64__)
+ const int32_t res = A * (static_cast<float>(res_value_q)) + B;
+#endif // __aarch64__
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
break;
}
- case ReductionOperation::MAX:
+ case ReductionOperation::SUM:
{
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ // Subtract accumulated offsets
+ res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::MEAN_SUM:
- {
- // Apply previously calculated coefficients (with rounding on aarch64)
-#ifdef __aarch64__
- const int32_t res = arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
-#else // defined(__aarch64__)
- const int32_t res = A * (static_cast<float>(res_value_q)) + B;
-#endif // __aarch64__
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract accumulated offsets
- res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
- break;
- }
- case ReductionOperation::PROD:
- {
- //re-quantize result
- T res = 0;
- if(std::is_same<T, uint8_t>::value)
+ case ReductionOperation::PROD:
{
- res = quantize_qasymm8(res_value, iq_info);
+ //re-quantize result
+ T res = 0;
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res = quantize_qasymm8(res_value, iq_info);
+ }
+ else
+ {
+ res = quantize_qasymm8_signed(res_value, iq_info);
+ }
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res;
+ break;
}
- else
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
{
- res = quantize_qasymm8_signed(res_value, iq_info);
+ *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
+ break;
}
- *(reinterpret_cast<T *>(output.ptr() + x)) = res;
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
- break;
+ default:
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
}
- default:
- *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
}
- }
- },
- input, output);
+ },
+ input, output);
}
};
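
The SUM path of the quantized reducers widens each uint8x16_t load into four 32-bit accumulators (vmovl to two 16-bit halves, then vmovl again), so roughly 2^24 bytes can be summed per lane without overflow. A standalone sketch with plain ACLE intrinsics in place of the wrapper:: helpers; accumulate_u8x16 is a hypothetical name, and the int8 case is the same shape with the signed intrinsics:

#include <arm_neon.h>
#include <cstdint>

// Adds 16 uint8 elements into four uint32x4_t accumulators,
// widening u8 -> u16 -> u32 exactly as the SUM path does.
static inline void accumulate_u8x16(const uint8_t *ptr,
                                    uint32x4_t &acc0, uint32x4_t &acc1,
                                    uint32x4_t &acc2, uint32x4_t &acc3)
{
    const uint8x16_t v    = vld1q_u8(ptr);
    const uint16x8_t lo16 = vmovl_u8(vget_low_u8(v));
    const uint16x8_t hi16 = vmovl_u8(vget_high_u8(v));
    acc0 = vaddq_u32(acc0, vmovl_u16(vget_low_u16(lo16)));
    acc1 = vaddq_u32(acc1, vmovl_u16(vget_high_u16(lo16)));
    acc2 = vaddq_u32(acc2, vmovl_u16(vget_low_u16(hi16)));
    acc3 = vaddq_u32(acc3, vmovl_u16(vget_high_u16(hi16)));
}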
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
+void reduce_op(
+ const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
const bool is_complex = (input->info()->num_channels() == 2);
- if(is_complex)
+ if (is_complex)
{
- switch(axis)
+ switch (axis)
{
case 2:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::F32:
- switch(op)
+ switch (op)
{
case ReductionOperation::SUM:
- return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+ return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(
+ window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(),
+ op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
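
One detail worth noting from the quantized MEAN_SUM finalisation further up: the aarch64 branch rounds the scaled sum (support::cpp11::round) before saturating, while the other branch truncates via the plain int conversion. A two-line illustration of the difference, with std::lround standing in for the library's rounding helper:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    const float   v         = 2.7f;
    const int32_t truncated = static_cast<int32_t>(v);              // 2: non-aarch64 branch
    const int32_t rounded   = static_cast<int32_t>(std::lround(v)); // 3: aarch64 branch
    std::printf("%d %d\n", truncated, rounded);
    return 0;
}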
@@ -1602,19 +1669,21 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
return;
}
- switch(axis)
+ switch (axis)
{
case 0:
{
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
{
- return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+ return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output,
+ RedOpX_quantized<uint8_t>(), op);
}
case DataType::QASYMM8_SIGNED:
{
- return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+ return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(),
+ op);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
@@ -1635,19 +1704,22 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
}
}
case 1:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
{
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
}
case DataType::QASYMM8_SIGNED:
{
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1657,15 +1729,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
ARM_COMPUTE_ERROR("Not supported");
}
case 2:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1675,15 +1750,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
ARM_COMPUTE_ERROR("Not supported");
}
case 3:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1704,9 +1782,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
@@ -1715,13 +1794,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
- if(!is_arg_min_max)
+ if (!is_arg_min_max)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
@@ -1731,8 +1811,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
}
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
- const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+ const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
}
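
validate_arguments compares the configured output against compute_reduced_shape of the input. A sketch of what such a shape computation does, assuming the library's convention of keeping the reduced axis with size 1; reduced_shape here is an illustrative stand-in, not the actual implementation:

#include <array>
#include <cstddef>

template <std::size_t N>
constexpr std::array<std::size_t, N> reduced_shape(std::array<std::size_t, N> shape, unsigned int axis)
{
    shape[axis] = 1; // the reduced dimension collapses to a single element
    return shape;
}
// e.g. reduced_shape<4>({8, 4, 2, 1}, 1) yields {8, 1, 2, 1}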
@@ -1745,7 +1826,10 @@ NEReductionOperationKernel::NEReductionOperationKernel()
{
}
-void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperationKernel::configure(const ITensor *input,
+ ITensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -1761,14 +1845,23 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
INEKernel::configure(win);
// Calculate output shape and set if empty
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
// Output auto initialization if not yet initialized
const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
DataType output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
}
-Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
index 08e654fd21..78bec62c14 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -77,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
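The validate() and configure() changes above both lean on compute_reduced_shape() to derive the expected output. As a rough standalone sketch of that derivation, assuming the reduced axis is kept with size 1, which is what the mismatching-shape checks compare against (the real helper lives in arm_compute::misc::shape_calculator):

#include <array>
#include <cstdio>

// Reduce a 4D shape along `axis`: the axis collapses to a single element.
std::array<int, 4> reduced_shape(std::array<int, 4> shape, unsigned int axis)
{
    shape[axis] = 1;
    return shape;
}

int main()
{
    const auto out = reduced_shape({8, 4, 2, 1}, 1);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // prints: 8 1 2 1
    return 0;
}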
diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp
index 1a7f58bb08..f92a4c87da 100644
--- a/src/core/NEON/kernels/NEReorderKernel.cpp
+++ b/src/core/NEON/kernels/NEReorderKernel.cpp
@@ -24,11 +24,13 @@
#if defined(__aarch64__)
#include "src/core/NEON/kernels/NEReorderKernel.h"
-#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
namespace arm_compute
{
@@ -37,29 +39,32 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
case DataType::F32:
{
const int ksize_rows_elements = _xmax * _ksize;
- const int jump_rows = ksize_rows_elements * window.x().start();
- const int k_start = window.x().start() * _ksize;
- const int k_end = std::min(window.x().end() * _ksize, _kmax);
- const int stride = _kmax;
- if(k_start < k_end)
+ const int jump_rows = ksize_rows_elements * window.x().start();
+ const int k_start = window.x().start() * _ksize;
+ const int k_end = std::min(window.x().end() * _ksize, _kmax);
+ const int stride = _kmax;
+ if (k_start < k_end)
{
-
- switch(_output_wf)
+ switch (_output_wf)
{
case WeightFormat::OHWIo4:
{
- arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
break;
}
#if defined(ARM_COMPUTE_ENABLE_SVE)
case WeightFormat::OHWIo8:
{
- arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
break;
}
#endif /* ARM_COMPUTE_ENABLE_SVE */
@@ -78,11 +83,20 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
}
NEReorderKernel::NEReorderKernel()
- : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY)
+ : _input(nullptr),
+ _output(nullptr),
+ _ksize(0),
+ _kmax(0),
+ _xmax(0),
+ _input_wf(WeightFormat::ANY),
+ _output_wf(WeightFormat::ANY)
{
}
-void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+void NEReorderKernel::configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -96,7 +110,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
// Setting parameters for transform
auto dims = input->info()->num_dimensions();
- switch(dims)
+ switch (dims)
{
case 2:
{
@@ -120,7 +134,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
// Window size is set by rows / _ksize
Window win;
int window_size = 0;
- switch(_output_wf)
+ switch (_output_wf)
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
case WeightFormat::OHWIo8:
@@ -142,7 +156,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
break;
}
}
- if(_kmax % _ksize != 0)
+ if (_kmax % _ksize != 0)
{
window_size += 1;
}
@@ -152,11 +166,14 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
INEKernel::configure(win);
}
-Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+Status NEReorderKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -167,20 +184,20 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
int output_x_dim;
int output_k_dim;
auto dims = output->num_dimensions();
- switch(dims)
+ switch (dims)
{
case 2:
{
- input_x_dim = input->dimension(0); // Number of columns in input matrix
- input_k_dim = input->dimension(1); // Number of rows in input matrix
+ input_x_dim = input->dimension(0); // Number of columns in input matrix
+ input_k_dim = input->dimension(1); // Number of rows in input matrix
output_x_dim = output->dimension(0); // Number of columns in output matrix
output_k_dim = output->dimension(1); // Number of rows in output matrix
break;
}
case 4:
{
- input_x_dim = input->dimension(2); // Number of columns in input matrix
- input_k_dim = input->dimension(3); // Number of rows in input matrix
+ input_x_dim = input->dimension(2); // Number of columns in input matrix
+ input_k_dim = input->dimension(3); // Number of rows in input matrix
output_x_dim = output->dimension(2); // Number of columns in output matrix
output_k_dim = output->dimension(3); // Number of rows in output matrix
break;
@@ -192,7 +209,7 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
}
int ksize;
- switch(output_wf)
+ switch (output_wf)
{
case WeightFormat::OHWIo8:
{
@@ -216,11 +233,10 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
// output x_dim needs to be same as input
ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
-
}
return Status{};
}
} // namespace arm_compute
-#endif // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
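For context on the reformatted Transform calls above: WeightFormat::OHWIo4 stores the weight matrix with rows interleaved in blocks of four (OHWIo8 uses blocks of eight under SVE). A rough standalone illustration of that layout idea, with hypothetical names; the real arm_gemm::Transform additionally deals with strides, sub-ranges and vector-length-specific blocking:

#include <cstdio>
#include <vector>

// Group the rows of a kmax x xmax row-major matrix into blocks of `bk` rows,
// storing each block column-by-column. Rows past kmax are zero-padded, which
// matches the round-up on the output K dimension that validate() checks.
std::vector<float> block_rows(const std::vector<float> &in, int kmax, int xmax, int bk)
{
    const int kpad = ((kmax + bk - 1) / bk) * bk;
    std::vector<float> out(static_cast<size_t>(kpad) * xmax, 0.0f);
    size_t o = 0;
    for (int k0 = 0; k0 < kpad; k0 += bk)
        for (int x = 0; x < xmax; ++x)
            for (int k = k0; k < k0 + bk; ++k)
                out[o++] = (k < kmax) ? in[static_cast<size_t>(k) * xmax + x] : 0.0f;
    return out;
}

int main()
{
    const std::vector<float> w = {1, 2, 3, 4, 5, 6}; // 3 rows x 2 cols, row-major
    const auto blocked = block_rows(w, 3, 2, 4);     // padded up to one block of 4 rows
    for (float v : blocked)
        std::printf("%.0f ", v);
    std::printf("\n"); // prints: 1 3 5 0 2 4 6 0
    return 0;
}

The trailing zeros are the round-up padding that the output K dimension must account for.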
diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h
index 07908890f4..4528b25245 100644
--- a/src/core/NEON/kernels/NEReorderKernel.h
+++ b/src/core/NEON/kernels/NEReorderKernel.h
@@ -26,9 +26,10 @@
#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
#define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
-#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
@@ -36,7 +37,6 @@ namespace arm_compute
class NEReorderKernel : public INEKernel
{
public:
-
const char *name() const override
{
return "NEReorderKernel";
@@ -62,7 +62,10 @@ public:
* @param[in] input_wf WeightFormat of input.
* @param[in] output_wf WeightFormat of output.
*/
- void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ void configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
/** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel
*
@@ -73,25 +76,27 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
-
-/*****************************************************************************/
+ /*****************************************************************************/
private:
- const ITensor *_input{nullptr}; // Input tensor
- ITensor *_output{nullptr}; // Output tensor
- int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
- int32_t _kmax{0}; // Rows in input tensor
- int32_t _xmax{0}; // Columns in input tensor
- WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
- WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
+ const ITensor *_input{nullptr}; // Input tensor
+ ITensor *_output{nullptr}; // Output tensor
+ int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
+ int32_t _kmax{0}; // Rows in input tensor
+ int32_t _xmax{0}; // Columns in input tensor
+ WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
+ WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
};
} // namespace arm_compute
#endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */
-#endif // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
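A hedged example of how this kernel's static check might be driven, assuming an aarch64 build inside the library tree (the header is internal, not public API); the shapes are chosen so the round-up checks in validate() pass:

#include "src/core/NEON/kernels/NEReorderKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
    // 2D weights: dimension 0 is columns (x), dimension 1 is rows (k).
    // 16 rows is already a multiple of the OHWIo4 block size of 4, so the
    // output shape can equal the input shape.
    const TensorInfo src(TensorShape(4U, 16U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(4U, 16U), 1, DataType::F32);

    const Status st = NEReorderKernel::validate(&src, &dst, WeightFormat::OHWI, WeightFormat::OHWIo4);
    return st.error_code() == ErrorCode::OK ? 0 : 1;
}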
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index a7b830c066..227570405c 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-NEReorgLayerKernel::NEReorgLayerKernel()
- : _input(nullptr), _output(nullptr), _stride(1)
+NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1)
{
}
@@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
Iterator out(_output, collapsed_window);
// Perform reorg
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- // Get spatial coords and channels
- const unsigned int w = id[idx_w];
- const unsigned int h = id[idx_h];
- const unsigned int c = id[idx_c];
-
- // Calculate mapping
- const unsigned int offset = c / out_c;
- Coordinates map_coords = id;
- map_coords.set(idx_w, w * stride + offset % stride);
- map_coords.set(idx_h, h * stride + offset / stride);
- map_coords.set(idx_c, c % out_c);
-
- // Perform mapping
- std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
- },
- out);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ // Get spatial coords and channels
+ const unsigned int w = id[idx_w];
+ const unsigned int h = id[idx_h];
+ const unsigned int c = id[idx_c];
+
+ // Calculate mapping
+ const unsigned int offset = c / out_c;
+ Coordinates map_coords = id;
+ map_coords.set(idx_w, w * stride + offset % stride);
+ map_coords.set(idx_h, h * stride + offset / stride);
+ map_coords.set(idx_c, c % out_c);
+
+ // Perform mapping
+ std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords),
+ _input->info()->element_size());
+ },
+ out);
}
} // namespace arm_compute
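The re-indented lambda above computes a gather: every output element of the reorg is copied from exactly one input location. A standalone sketch of just that index math, where `cin` stands in for the kernel's out_c (output channels divided by stride squared, i.e. the input channel count):

#include <cstdio>

struct Coord3
{
    unsigned w, h, c;
};

// Output (w, h, c) reads input (w*stride + off%stride, h*stride + off/stride,
// c % cin), with off = c / cin selecting the spatial phase inside the block.
Coord3 reorg_source(Coord3 out, unsigned stride, unsigned cin)
{
    const unsigned off = out.c / cin;
    return {out.w * stride + off % stride, out.h * stride + off / stride, out.c % cin};
}

int main()
{
    const Coord3 src = reorg_source({1, 0, 5}, 2, 2); // 8 output channels, 2 input channels
    std::printf("in = (%u, %u, %u)\n", src.w, src.h, src.c); // prints: in = (2, 1, 1)
    return 0;
}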
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index ca6c117882..d2437eecd0 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -26,15 +26,17 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
{
ARM_COMPUTE_UNUSED(use_inverted_axis);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
@@ -42,11 +44,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Current implementation only supports up to 4 dimensions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+ "Current implementation only supports up to 4 dimensions.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -57,8 +60,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NEReverseKernel::NEReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
+NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
{
}
@@ -80,7 +82,10 @@ void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITe
INEKernel::configure(calculate_max_window(*output->info()));
}
-Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status NEReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
@@ -88,29 +93,30 @@ Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
}
template <typename T>
-void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
+void run_reverse(
+ const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
{
unsigned int axis_bit = 0;
const int rank = input->info()->num_dimensions();
- for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+ for (unsigned int i = 0; i < axis->info()->dimension(0); ++i)
{
int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
// The values of the axis tensor must be within [-rank, rank-1].
- if((axis_i < -rank) || (axis_i >= rank))
+ if ((axis_i < -rank) || (axis_i >= rank))
{
ARM_COMPUTE_ERROR("the valuses of the axis tensor must be within [-rank, rank-1].");
}
// In case of negative axis value i.e targeted axis(i) = rank + axis(i)
- if(axis_i < 0)
+ if (axis_i < 0)
{
axis_i = rank + axis_i;
}
// Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis
- if(use_inverted_axis)
+ if (use_inverted_axis)
{
axis_i = (rank - 1) - axis_i;
}
@@ -127,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
-
- // Reverse 0 axis
- if(axis_bit & 0x1)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- in = wrapper::vrev64(in);
- in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
+
+ // Reverse 0 axis
+ if (axis_bit & 0x1)
+ {
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ }
+
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+ auto out_ptr =
+ reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+ wrapper::vstore(out_ptr, in);
}
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
-
- auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
- wrapper::vstore(out_ptr, in);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
- }
- },
- input_it);
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) =
+ in;
+ }
+ },
+ input_it);
}
void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
@@ -172,7 +182,7 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 4:
run_reverse<uint32_t>(window, _input, _axis, _output, _use_inverted_axis);
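The axis handling reformatted above is easy to restate in isolation: each requested axis is range-checked, wrapped if negative, optionally inverted into ACL dimension order, and recorded as one bit of a mask that the vector loop then tests per dimension. A minimal standalone sketch:

#include <cstdio>
#include <vector>

unsigned int axis_mask(const std::vector<int> &axes, int rank, bool use_inverted_axis)
{
    unsigned int bits = 0;
    for (int a : axes)
    {
        if (a < -rank || a >= rank)
            return 0; // the kernel raises an error here instead
        if (a < 0)
            a += rank; // negative axes count from the back
        if (use_inverted_axis)
            a = (rank - 1) - a; // convert to ACL dimension order
        bits |= 1u << a;
    }
    return bits;
}

int main()
{
    std::printf("0x%x\n", axis_mask({-1, 0}, 4, false)); // prints: 0x9 (dims 3 and 0)
    return 0;
}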
diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h
index 7d9ec4691c..92261887f4 100644
--- a/src/core/NEON/kernels/NEReverseKernel.h
+++ b/src/core/NEON/kernels/NEReverseKernel.h
@@ -68,7 +68,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index b8c9b244ee..7789b828ea 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -29,13 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/select/list.h"
#include <arm_neon.h>
@@ -54,7 +53,8 @@ struct SelectKernelSelectorData
};
using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
-using KernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+using KernelPtr =
+ std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
struct SelectKernelSelector
{
@@ -63,95 +63,62 @@ struct SelectKernelSelector
KernelPtr ukernel;
};
-static const SelectKernelSelector available_kernels[] =
-{
- {
- "neon_s8_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
- },
- {
- "neon_s16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
- },
- {
- "neon_s32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
- },
- {
- "neon_u8_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
- },
- {
- "neon_u16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
- },
- {
- "neon_u32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
- },
- {
- "neon_s8_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
- },
- {
- "neon_s16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
- },
- {
- "neon_s32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
- },
- {
- "neon_u8_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
- },
- {
- "neon_u16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
- },
- {
- "neon_u32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
- },
- {
- "neon_f16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
- },
- {
- "neon_f16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
- },
- {
- "neon_f32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
- },
- {
- "neon_f32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
- },
+static const SelectKernelSelector available_kernels[] = {
+ {"neon_s8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)},
+ {"neon_s16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)},
+ {"neon_s32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)},
+ {"neon_u8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)},
+ {"neon_u16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)},
+ {"neon_u32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)},
+ {"neon_s8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)},
+ {"neon_s16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)},
+ {"neon_s32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)},
+ {"neon_u8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)},
+ {"neon_u16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)},
+ {"neon_u32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)},
+ {"neon_f16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)},
+ {"neon_f16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)},
+ {"neon_f32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)},
+ {"neon_f32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)},
};
const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -184,7 +151,8 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
INEKernel::configure(win);
}
-Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
@@ -195,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -214,7 +184,7 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_output == nullptr);
ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
- const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+ const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
uk->ukernel(_c, _x, _y, _output, window);
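The table rewrite above is purely cosmetic, but the mechanism deserves a sketch: kernel selection walks an ordered array of {name, predicate, function pointer} entries and returns the first match, which is how the REGISTER_*_NEON micro-kernels are picked at run time. A stripped-down standalone version of the same pattern, with invented types:

#include <cstdio>

enum class DT
{
    F32,
    S32
};

struct Data
{
    DT dt;
    bool same_rank;
};

struct Entry
{
    const char *name;
    bool (*selected)(const Data &);
    void (*kernel)();
};

void f32_same() { std::puts("f32 same-rank path"); }
void s32_same() { std::puts("s32 same-rank path"); }

// Captureless lambdas convert to plain function pointers, as in the kernel table.
static const Entry table[] = {
    {"f32_same", [](const Data &d) { return d.dt == DT::F32 && d.same_rank; }, f32_same},
    {"s32_same", [](const Data &d) { return d.dt == DT::S32 && d.same_rank; }, s32_same},
};

const Entry *pick(const Data &d)
{
    for (const auto &e : table)
        if (e.selected(d))
            return &e;
    return nullptr;
}

int main()
{
    if (const Entry *e = pick({DT::F32, true}))
        e->kernel(); // prints: f32 same-rank path
    return 0;
}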
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index e82105a68e..4fec42b536 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -82,7 +83,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
-
const ITensor *_c; /**< Condition tensor */
const ITensor *_x; /**< Source tensor 1 */
const ITensor *_y; /**< Source tensor 2 */
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 673eace3c1..da023aeb96 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -41,19 +42,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -86,14 +95,25 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
} // namespace
NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _paddings(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _padding_left(),
+ _block_shape_x(),
+ _block_shape_y()
{
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const ITensor *block_shape,
+ const ITensor *paddings,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
_input = input;
_block_shape = block_shape;
@@ -106,15 +126,22 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *b
ICPPKernel::configure(win);
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_
INEKernel::configure(win);
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
_block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
}
- if(_paddings != nullptr)
+ if (_paddings != nullptr)
{
- const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 }));
- const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 }));
+ const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({0, 0}));
+ const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({1, 0}));
_padding_left = Size2D(pad_left_x, pad_left_y);
}
const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
@@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.x();
- const size_t out_y = id.y();
- const size_t z = id.z();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ in_x, in_y, z, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.x();
+ const size_t out_y = id.y();
+ const size_t z = id.z();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{in_x, in_y, z, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.y();
- const size_t out_y = id.z();
- const size_t z = id.x();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ z, in_x, in_y, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.y();
+ const size_t out_y = id.z();
+ const size_t z = id.x();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{z, in_x, in_y, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
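Both branches reformatted above share one piece of index math: the block phase of an output batch selects which input pixel inside the block an element came from, and positions that fall in the padding are skipped rather than copied. A standalone sketch of the NCHW variant:

#include <cstdio>

struct Src
{
    int x, y, z, w;
    bool in_bounds; // false means the output element lies in the padding
};

Src s2b_source(int out_x, int out_y, int z, int batch_id, int batch_size,
               int block_x, int block_y, int pad_x, int pad_y, int width, int height)
{
    const int pos_x = out_x * block_x + (batch_id / batch_size) % block_x;
    const int pos_y = out_y * block_y + (batch_id / batch_size) / block_x;
    const bool inside =
        pos_x >= pad_x && pos_x < pad_x + width && pos_y >= pad_y && pos_y < pad_y + height;
    return {pos_x - pad_x, pos_y - pad_y, z, batch_id % batch_size, inside};
}

int main()
{
    const Src s = s2b_source(0, 0, 0, /*batch_id=*/3, /*batch_size=*/1, 2, 2, 0, 0, 4, 4);
    std::printf("(%d, %d, %d, %d) %s\n", s.x, s.y, s.z, s.w,
                s.in_bounds ? "copy" : "pad"); // prints: (1, 1, 0, 0) copy
    return 0;
}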
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
index 44b8cbb514..6292c07136 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -69,7 +70,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ void configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -79,7 +85,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -91,7 +100,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 7687c50c40..b49c5ee344 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.z();
- const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ in_x, in_y, z, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.z();
+ const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{in_x, in_y, z, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.x();
- const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ z, in_x, in_y, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.x();
+ const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{z, in_x, in_y, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
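The space-to-depth mapping above is the same idea without batching: the extra output channels encode a spatial phase within the block. A standalone sketch of the NCHW case, where `cin` mirrors the kernel's channel_size:

#include <cstdio>

struct In3
{
    unsigned x, y, c;
};

// Output channel c carries a phase (c / cin) on top of the source channel (c % cin).
In3 s2d_source(unsigned out_x, unsigned out_y, unsigned c, unsigned block, unsigned cin)
{
    const unsigned phase = c / cin;
    return {out_x * block + phase % block, out_y * block + phase / block, c % cin};
}

int main()
{
    const In3 s = s2d_source(1, 1, 6, 2, 2); // block 2, 2 input channels, output channel 6
    std::printf("in = (%u, %u, %u)\n", s.x, s.y, s.c); // prints: in = (3, 3, 0)
    return 0;
}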
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index 953b68a401..7d147c5b94 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 93080e2ac7..e23b40a9aa 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -25,13 +25,13 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +41,11 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -50,9 +54,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -60,7 +65,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -71,11 +77,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
return std::make_pair(Status{}, win);
}
-inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+inline Coordinates
+shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
{
constexpr int max_out_coord = 5; // Input shape is at most 4D, output at most 5D
Coordinates id_out = id;
- for(unsigned int i = max_out_coord - 1; i > axis; --i)
+ for (unsigned int i = max_out_coord - 1; i > axis; --i)
{
id_out.set(i, id[i - 1]);
}
@@ -84,12 +91,12 @@ inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id,
}
} // namespace
-NEStackLayerKernel::NEStackLayerKernel()
- : _input(nullptr), _output(nullptr), _axis(), _idx_input()
+NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input()
{
}
-void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+void NEStackLayerKernel::configure(
+ const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -106,10 +113,15 @@ void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsi
INEKernel::configure(win_config.second);
}
-Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status NEStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
@@ -131,12 +143,15 @@ void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
- const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
- std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
- },
- input);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+ const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w +
+ id_out[4] * stride_k;
+ std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
+ },
+ input);
}
} // namespace arm_compute
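
For reference, a minimal sketch of the coordinate mapping the stack kernel applies: every input dimension at or above `axis` shifts up by one and the index of the stacked tensor is written at position `axis`. The helper name `stack_coord` and the fixed 4D/5D ranks are illustrative, not library API.

    #include <array>
    #include <cstddef>

    // Shift every input dimension >= axis up by one and insert the index
    // of the stacked tensor at position axis (mirrors
    // shift_from_axis_and_replace_coordinate above).
    std::array<std::size_t, 5> stack_coord(const std::array<std::size_t, 4> &id, unsigned axis, unsigned idx_input)
    {
        std::array<std::size_t, 5> out{};
        for (unsigned i = 0, o = 0; o < 5; ++o)
        {
            out[o] = (o == axis) ? idx_input : id[i++];
        }
        return out;
    }
    // e.g. id = {x, y, z, w}, axis = 1, idx_input = k -> {x, k, y, z, w};
    // the byte offset then follows from the output strides exactly as in
    // NEStackLayerKernel::run.
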
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index 9b36518e4d..685812b56d 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -64,7 +65,8 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+ void configure(
+ const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -78,7 +80,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
// Inherited methods overridden
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 2b406a8b8b..efff51be9d 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
// Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
// Create window
@@ -88,38 +95,49 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-NEStridedSliceKernel::NEStridedSliceKernel()
- : _starts_abs(), _final_strides(), _shrink_mask()
+NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask()
{
}
-void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSliceKernel::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
_shrink_mask = shrink_axis_mask;
const TensorShape &input_shape = input->tensor_shape();
Coordinates ends_abs;
- std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(_starts_abs, ends_abs, _final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ auto win_config =
+ validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
- starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends,
+ strides, begin_mask, end_mask, shrink_axis_mask)
+ .first);
return Status{};
}
@@ -156,7 +174,7 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co
size_t length_x = win.shape()[0];
- if(_final_strides[0] == 1 && !is_shrink_x)
+ if (_final_strides[0] == 1 && !is_shrink_x)
{
win.set(Window::DimX, Window::Dimension(0, 1, 1));
width_size = width_size * length_x;
@@ -183,16 +201,17 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co
uint8_t *cur_ptr;
execute_window_loop(
- win, [&](const Coordinates & id)
- {
- cur_ptr = input_base;
- cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
- cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
- cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
- cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
-
- std::copy_n(cur_ptr, width_size, output_it.ptr());
- },
- output_it);
+ win,
+ [&](const Coordinates &id)
+ {
+ cur_ptr = input_base;
+ cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
+ cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
+ cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
+ cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
+
+ std::copy_n(cur_ptr, width_size, output_it.ptr());
+ },
+ output_it);
}
} // namespace arm_compute
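
For orientation, the per-dimension indexing the strided-slice kernel realises reduces to start + i * stride; a scalar sketch under that reading (mask resolution and shrink axes are handled upstream by calculate_strided_slice_coords):

    // Absolute input position of output index i along one dimension.
    // Strides may be negative, which walks the dimension backwards.
    int sliced_index(int start, int stride, int i)
    {
        return start + i * stride;
    }
    // e.g. start = 6, stride = -2: output indices 0, 1, 2 read input
    // elements 6, 4, 2.
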
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h
index 9ce517417d..a475f09a17 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.h
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
#include <cstdint>
@@ -68,9 +69,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
*
@@ -86,9 +92,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index 94256dc12d..577ce5b69e 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NETileKernel::NETileKernel()
- : _input(nullptr), _output(nullptr)
+NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window output_window{ window };
- output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0)));
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
Window out_slice = output_window.first_slice_window_1D();
const auto src_shape = _input->info()->tensor_shape();
@@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
{
Iterator output_it(_output, out_slice);
- execute_window_loop(out_slice, [&](const Coordinates & id)
- {
- const size_t x = id.x();
- const size_t y = id.y();
- const size_t z = id.z();
- const size_t w = id[3];
- Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] };
- memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size());
- },
- output_it);
- }
- while(output_window.slide_window_slice_1D(out_slice));
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &id)
+ {
+ const size_t x = id.x();
+ const size_t y = id.y();
+ const size_t z = id.z();
+ const size_t w = id[3];
+ Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]};
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_coords),
+ _input->info()->dimension(0) * _input->info()->element_size());
+ },
+ output_it);
+ } while (output_window.slide_window_slice_1D(out_slice));
}
} // namespace arm_compute
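
A minimal sketch of the wrap-around addressing used in the loop above: every output coordinate reads the input at that coordinate modulo the source shape (the helper name is illustrative):

    #include <array>
    #include <cstddef>

    // Tiling repeats the source tensor, so output coordinate c maps to
    // input coordinate c % src_shape in every dimension.
    std::array<std::size_t, 4> tiled_coord(const std::array<std::size_t, 4> &c,
                                           const std::array<std::size_t, 4> &src_shape)
    {
        return {c[0] % src_shape[0], c[1] % src_shape[1], c[2] % src_shape[2], c[3] % src_shape[3]};
    }
    // e.g. src_shape = {2, 3, 1, 1} and c = {5, 4, 0, 0} reads input {1, 1, 0, 0}.
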
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index dbd47ccfa9..13c2d314e4 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -38,9 +38,8 @@ struct DepthwiseConfig
DepthwiseMethod method = DepthwiseMethod::DEFAULT;
std::string filter = "";
- DepthwiseConfig(DepthwiseMethod method)
- : method(method) {};
- DepthwiseConfig() {};
+ DepthwiseConfig(DepthwiseMethod method) : method(method){};
+ DepthwiseConfig(){};
};
struct DepthwiseArgs
@@ -63,18 +62,24 @@ struct DepthwiseArgs
bool fast_mode = false;
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int dilation_rows, unsigned int dilation_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
-
- const DepthwiseConfig *config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int dilation_rows,
+ unsigned int dilation_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
: cpu_info(cpu_info),
kernel_rows(kernel_rows),
kernel_cols(kernel_cols),
@@ -95,20 +100,38 @@ struct DepthwiseArgs
{
}
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
- const DepthwiseConfig *config)
- : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
- stride_cols, 1, 1, n_batches, input_rows, input_cols,
- input_channels, output_rows, output_cols,
- channel_multiplier, padding, activation, config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+ const DepthwiseConfig *config)
+ : DepthwiseArgs(cpu_info,
+ kernel_rows,
+ kernel_cols,
+ stride_rows,
+ stride_cols,
+ 1,
+ 1,
+ n_batches,
+ input_rows,
+ input_cols,
+ input_channels,
+ output_rows,
+ output_cols,
+ channel_multiplier,
+ padding,
+ activation,
+ config)
{
}
};
@@ -127,17 +150,18 @@ struct Tile
{
}
- Tile()
- : Tile(nullptr, 0, 0, 0)
+ Tile() : Tile(nullptr, 0, 0, 0)
{
}
- void load_from(
- const TInput *input,
- const unsigned int ld_row, const unsigned int ld_col,
- const unsigned int n_rows, const unsigned int n_cols,
- const int input_i, const int input_j,
- const unsigned int channel_multiplier) const
+ void load_from(const TInput *input,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ const unsigned int n_rows,
+ const unsigned int n_cols,
+ const int input_i,
+ const int input_j,
+ const unsigned int channel_multiplier) const
{
const auto pad_top = input_i < 0 ? -input_i : 0;
const auto pad_left = input_j < 0 ? -input_j : 0;
@@ -145,18 +169,15 @@ struct Tile
const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
- if(padded_rows < tile_rows || padded_cols < tile_cols)
+ if (padded_rows < tile_rows || padded_cols < tile_cols)
{
memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
}
- do_premultiply<TInput>(
- (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col,
- ld_row, ld_col,
- array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
- tile_cols * tile_channels, tile_channels,
- padded_rows, padded_cols, tile_channels / channel_multiplier,
- channel_multiplier);
+ do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+ ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+ tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+ tile_channels / channel_multiplier, channel_multiplier);
}
};
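
The padding arithmetic in Tile::load_from can be restated as a standalone sketch (same formulas, illustrative struct; the tile is zero-filled first whenever any padding applies):

    #include <algorithm>

    struct TilePadding
    {
        int pad_top, pad_left;      // leading zero rows/cols in the tile
        int valid_rows, valid_cols; // rows/cols actually copied from input
    };

    // Anchor the tile at (input_i, input_j) over an n_rows x n_cols input.
    TilePadding tile_padding(int input_i, int input_j, int n_rows, int n_cols, int tile_rows, int tile_cols)
    {
        TilePadding p{};
        p.pad_top    = input_i < 0 ? -input_i : 0;
        p.pad_left   = input_j < 0 ? -input_j : 0;
        p.valid_rows = std::min(n_rows - input_i, tile_rows) - p.pad_top;
        p.valid_cols = std::min(n_cols - input_j, tile_cols) - p.pad_left;
        return p;
    }
    // e.g. input_i = -1, n_rows = 4, tile_rows = 3 -> pad_top = 1 and
    // valid_rows = 2: tile row 0 is zero padding, rows 1-2 hold input rows 0-1.
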
@@ -168,9 +189,8 @@ protected:
std::string m_name{};
public:
- DepthwiseCommon(const DepthwiseArgs &args)
- : m_args(args) {};
- DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
std::string name() const override
@@ -181,19 +201,18 @@ public:
void set_name(std::string name)
{
// Only allow the name to be set once
- if(m_name.empty())
+ if (m_name.empty())
{
m_name = name;
}
}
- void execute(
- const void *const input,
- const void *const parameters,
- void *const output,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ const void *const parameters,
+ void *const output,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
const size_t ld_input_col = m_args.input_channels;
const size_t ld_input_row = ld_input_col * m_args.input_cols;
@@ -202,56 +221,47 @@ public:
const size_t ld_output_row = ld_output_col * m_args.output_cols;
const size_t ld_output_batch = ld_output_row * m_args.output_rows;
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters, output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *const parameters,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *const parameters,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.input_channels, m_args.padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+ ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &padding,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const override final
+ void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const override final
{
// Construct a new set of arguments to reflect that we might have been
// passed different input/output tensors. Dilation is handled at this
@@ -271,38 +281,33 @@ public:
auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
- for(size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
{
size_t start_i;
- std::tie(args.output_rows, args.input_rows, start_i,
- args.padding.top, args.padding.bottom) =
- get_reduced_view_for_dilation(
- output_height, input_height, drow, m_args.dilation_rows,
- m_args.kernel_rows, m_args.stride_rows, padding.top);
+ std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
- if(args.output_rows)
+ if (args.output_rows)
{
- for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
{
size_t start_j;
- std::tie(args.output_cols, args.input_cols, start_j,
- args.padding.left, args.padding.right) =
- get_reduced_view_for_dilation(
- output_width, input_width, dcol, m_args.dilation_cols,
- m_args.kernel_cols, m_args.stride_cols, padding.left);
+ std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
const TInput *input_col = input_row + start_j * ld_input_col;
TOutput *output_col = output_row + dcol * ld_output_col;
- if(args.output_cols)
+ if (args.output_cols)
{
- this->execute_internal(
- args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
- output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
- working_space, thread_id, n_threads);
+ this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+ parameters, output_col, ld_output_col_d, ld_output_row_d,
+ ld_output_batch, working_space, thread_id, n_threads);
}
}
}
@@ -310,20 +315,19 @@ public:
}
protected:
- virtual void execute_internal(
- const DepthwiseArgs &instance_args,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute_internal(const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
virtual bool uses_premultiply() const
{
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index a5db793b3d..5ff848e281 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -49,11 +49,7 @@ struct KernelDescription
bool is_default = false;
uint64_t cycle_estimate = 0;
- KernelDescription(
- DepthwiseMethod method,
- std::string name,
- bool is_default,
- uint64_t cycle_estimate)
+ KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
: method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
{
}
@@ -78,58 +74,51 @@ public:
// pointer the bias vector (which may be nullptr in the case of no bias) and
// a pointer to the array of weights (stored in HWIO order).
virtual void pack_parameters(
- void *buffer,
- const void *biases,
- const void *weights,
- size_t ld_weight_col = 0,
- size_t ld_weight_row = 0) = 0;
+ void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
// Determine the amount of working space required
virtual size_t get_working_size(unsigned int n_threads) const = 0;
// Execute the convolution over the specified area of memory.
- virtual void execute(
- const void *input, // Pointer to input tensor
- const void *parameters, // Packed parameters buffer
- void *output,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute(const void *input, // Pointer to input tensor
+ const void *parameters, // Packed parameters buffer
+ void *output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
// To handle a dilation factor of D execute the kernel once for each d in
@@ -145,12 +134,13 @@ public:
// - Number of valid input pixels corresponding to `d`
// - Offset of the first pixel corresponding to `d`
// - Amount of padding in the view for `d`
-std::tuple<size_t, size_t, size_t, size_t, size_t>
-get_reduced_view_for_dilation(
- size_t out_size, size_t in_size,
- size_t d, size_t dilation_factor,
- size_t kernel_size, size_t stride,
- size_t pad_before);
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+ size_t in_size,
+ size_t d,
+ size_t dilation_factor,
+ size_t kernel_size,
+ size_t stride,
+ size_t pad_before);
} // namespace depthwise
} // namespace arm_conv
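
The dilation handling described above runs the kernel once per phase d in [0, D); each pass only touches input rows spaced D apart, so it behaves like a dilation-free kernel on a subsampled view. A small illustration of which rows one application reads (assumed decomposition, not the exact arithmetic of get_reduced_view_for_dilation):

    #include <vector>

    // Input rows read by a single kernel application in one dilation
    // pass, starting from row `base` (illustrative helper).
    std::vector<int> pass_rows(int base, unsigned kernel_rows, unsigned dilation)
    {
        std::vector<int> rows;
        for (unsigned k = 0; k < kernel_rows; ++k)
        {
            rows.push_back(base + static_cast<int>(k * dilation));
        }
        return rows;
    }
    // e.g. pass_rows(0, 3, 2) == {0, 2, 4}: a plain 3-tap kernel on the
    // stride-2 view of the input.
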
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index f1f70cf1d6..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -68,45 +68,42 @@ public:
virtual size_t get_working_size(unsigned int num_threads) const = 0;
// Execute pooling over the specified area of memory.
- virtual void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index e8db35c593..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -36,9 +36,8 @@ struct PoolingConfig
PoolingMethod method = PoolingMethod::DEFAULT;
std::string filter = "";
- PoolingConfig(PoolingMethod method)
- : method(method) {};
- PoolingConfig() {};
+ PoolingConfig(PoolingMethod method) : method(method){};
+ PoolingConfig(){};
};
struct PoolingArgs
@@ -57,30 +56,40 @@ struct PoolingArgs
const PoolingConfig *config;
- PoolingArgs(
- const CPUInfo *cpu_info,
- PoolingType pool_type,
- const PoolingWindow &window,
- const PoolingStride &stride,
- bool exclude_padding,
- unsigned int n_batches,
- unsigned int input_rows,
- unsigned int input_cols,
- unsigned int n_channels,
- unsigned int output_rows,
- unsigned int output_cols,
- const PaddingValues &padding,
- const PoolingConfig *cfg)
- : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+ PoolingArgs(const CPUInfo *cpu_info,
+ PoolingType pool_type,
+ const PoolingWindow &window,
+ const PoolingStride &stride,
+ bool exclude_padding,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int n_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ const PaddingValues &padding,
+ const PoolingConfig *cfg)
+ : cpu_info(cpu_info),
+ pool_type(pool_type),
+ pool_window(window),
+ pool_stride(stride),
+ exclude_padding(exclude_padding),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ n_channels(n_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ padding(padding),
+ config(cfg)
{
// If either of the pooling window dimensions is set to zero, meaning
// "pool everything", then replace with the corresponding input dimension.
- if(pool_window.rows == 0)
+ if (pool_window.rows == 0)
{
pool_window.rows = input_rows;
}
- if(pool_window.cols == 0)
+ if (pool_window.cols == 0)
{
pool_window.cols = input_cols;
}
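
A minimal sketch of the "pool everything" convention implemented just above (illustrative wrapper, not library API):

    struct Window2D
    {
        unsigned rows, cols;
    };

    // A zero window dimension means global pooling in that dimension.
    Window2D resolve_pool_window(Window2D w, unsigned input_rows, unsigned input_cols)
    {
        if (w.rows == 0)
        {
            w.rows = input_rows;
        }
        if (w.cols == 0)
        {
            w.cols = input_cols;
        }
        return w;
    }
    // e.g. a {0, 0} window on a 7x7 feature map becomes a 7x7 (global) window.
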
@@ -100,10 +109,16 @@ struct Requantize32
int32_t per_layer_right_shift = 0;
int32_t per_layer_mul = 0;
- Requantize32(int32_t input_offset, int32_t output_offset,
- int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+ Requantize32(int32_t input_offset,
+ int32_t output_offset,
+ int32_t per_layer_left_shift,
+ int32_t per_layer_right_shift,
int32_t per_layer_mul)
- : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+ : input_offset(input_offset),
+ output_offset(output_offset),
+ per_layer_left_shift(per_layer_left_shift),
+ per_layer_right_shift(per_layer_right_shift),
+ per_layer_mul(per_layer_mul)
{
}
};
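
The Requantize32 fields above name the pieces of the usual gemmlowp-style fixed-point requantisation; a scalar sketch under that assumption (the header does not spell out the arithmetic, so treat this as indicative only; saturation corner cases omitted):

    #include <cstdint>

    // Rounding-doubling high multiply (gemmlowp-style, assumed semantics).
    int32_t rdh_mul(int32_t a, int32_t b)
    {
        const int64_t p = static_cast<int64_t>(a) * b + (int64_t{1} << 30);
        return static_cast<int32_t>(p >> 31);
    }

    int32_t requantize(int32_t acc, int32_t left_shift, int32_t right_shift, int32_t mul, int32_t output_offset)
    {
        const int32_t x = static_cast<int32_t>(static_cast<int64_t>(acc) << left_shift); // per_layer_left_shift
        int32_t y = rdh_mul(x, mul);                                                     // per_layer_mul
        if (right_shift > 0)
        {
            y = (y + (1 << (right_shift - 1))) >> right_shift; // rounded per_layer_right_shift
        }
        return y + output_offset;
    }
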
@@ -115,105 +130,88 @@ protected:
const PoolingArgs m_args;
public:
- PoolingCommon(const PoolingArgs &args)
- : m_args(args)
+ PoolingCommon(const PoolingArgs &args) : m_args(args)
{
}
- PoolingCommon(PoolingCommon &) = delete;
+ PoolingCommon(PoolingCommon &) = delete;
PoolingCommon &operator=(PoolingCommon &) = delete;
size_t get_working_size(unsigned int) const override = 0;
// Execute pooling over the specified area of memory.
- void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- input,
- m_args.n_channels,
- m_args.n_channels * m_args.input_cols,
- m_args.n_channels * m_args.input_cols * m_args.input_rows,
- output,
- m_args.n_channels,
- m_args.n_channels * m_args.output_cols,
- m_args.n_channels * m_args.output_cols * m_args.output_rows,
- working_space,
- thread_id, num_threads);
+ this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+ m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+ m_args.n_channels * m_args.output_cols,
+ m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+ num_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding, m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+ ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &padding,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute_internal(
- batches, height, width, channels, padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- output_height, output_width,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+ ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, num_threads);
}
protected:
- virtual void execute_internal(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const PaddingValues &,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute_internal(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
template <typename TInput, typename TOutput>
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
index 16f26de38a..fb97cf8baf 100644
--- a/src/core/NEON/kernels/assembly/premultiply.hpp
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr,
const unsigned input_channels,
const unsigned int channel_multiplier)
{
- if(sizeof(T) == 4 && channel_multiplier == 6)
+ if (sizeof(T) == 4 && channel_multiplier == 6)
{
- do_premultiply_float_6(
- (const float *)in_ptr, ld_row, ld_col,
- (float *)out_ptr, out_ld_row, out_ld_col,
- tile_rows, tile_cols,
- input_channels);
+ do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+ tile_rows, tile_cols, input_channels);
}
else
{
- for(unsigned int i = 0; i < tile_rows; i++)
+ for (unsigned int i = 0; i < tile_rows; i++)
{
const T *ip2 = in_ptr + i * ld_row;
T *op2 = out_ptr + i * out_ld_row;
- for(unsigned int j = 0; j < tile_cols; j++)
+ for (unsigned int j = 0; j < tile_cols; j++)
{
const T *ip = ip2;
T *op = op2;
- for(unsigned int c = 0; c < input_channels; c++)
+ for (unsigned int c = 0; c < input_channels; c++)
{
T val = *ip;
ip++;
- for(unsigned int r = 0; r < channel_multiplier; r++)
+ for (unsigned int r = 0; r < channel_multiplier; r++)
{
op[r] = val;
}
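
A scalar sketch of the channel pre-multiplication the generic path above performs, on a single contiguous row (illustrative helper):

    // Replicate each of the input_channels values channel_multiplier
    // times, yielding input_channels * channel_multiplier outputs.
    void premultiply_1d(const float *in, float *out, unsigned input_channels, unsigned channel_multiplier)
    {
        for (unsigned c = 0; c < input_channels; ++c)
        {
            for (unsigned r = 0; r < channel_multiplier; ++r)
            {
                out[c * channel_multiplier + r] = in[c];
            }
        }
    }
    // e.g. in = {a, b} with channel_multiplier = 3 -> out = {a, a, a, b, b, b}.
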
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
index 50290757ec..dbf95d23cd 100644
--- a/src/core/NEON/kernels/assembly/winograd.hpp
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -45,17 +45,24 @@ struct ConvolutionArgs
Shape2D kernel_shape;
arm_gemm::Activation activation;
- ConvolutionArgs(
- unsigned int n_batches,
- const Shape2D &input_shape,
- unsigned int n_input_channels,
- unsigned int pad_top, unsigned int pad_left,
- const Shape2D &output_shape,
- unsigned int n_output_channels,
- const Shape2D kernel_shape,
- const arm_gemm::Activation &activation = {})
- : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels),
- kernel_shape(kernel_shape), activation(activation)
+ ConvolutionArgs(unsigned int n_batches,
+ const Shape2D &input_shape,
+ unsigned int n_input_channels,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ const Shape2D &output_shape,
+ unsigned int n_output_channels,
+ const Shape2D kernel_shape,
+ const arm_gemm::Activation &activation = {})
+ : n_batches(n_batches),
+ input_shape(input_shape),
+ n_input_channels(n_input_channels),
+ pad_top(pad_top),
+ pad_left(pad_left),
+ output_shape(output_shape),
+ n_output_channels(n_output_channels),
+ kernel_shape(kernel_shape),
+ activation(activation)
{
}
};
@@ -105,23 +112,30 @@ public:
virtual unsigned int get_transformed_tile_rows(void) const = 0;
virtual unsigned int get_transformed_tile_cols(void) const = 0;
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, const WinogradDomainSpec &wds,
- unsigned int thread_id, unsigned int n_threads) const
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_row, ld_in_col, ld_input_channel,
- outptr, wds.weight_ld_matrix, wds.weight_ld_row,
- thread_id, n_threads);
+ this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+ wds.weight_ld_row, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, size_t ld_out_matrix, size_t ld_out_row,
- unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace weight_transform
@@ -136,27 +150,35 @@ public:
virtual unsigned int get_input_rows(void) const = 0;
virtual unsigned int get_input_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, const WinogradDomainSpec &wds,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_batch, ld_in_row, ld_in_col,
- outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+ wds.input_ld_row, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace input_transform
@@ -177,31 +199,37 @@ public:
virtual unsigned int get_kernel_rows(void) const = 0;
virtual unsigned int get_kernel_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, const WinogradDomainSpec &wds,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ const WinogradDomainSpec &wds,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args,
- inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row,
- bias,
- outptr, ld_out_batch, ld_out_row, ld_out_col,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+ ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_matrix,
+ size_t ld_in_row,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace output_transform
@@ -210,7 +238,7 @@ struct WinogradImpl
{
const output_transform::ITransform *output_transform = nullptr;
const weight_transform::ITransform *weight_transform = nullptr;
- const input_transform::ITransform *input_transform = nullptr;
+ const input_transform::ITransform *input_transform = nullptr;
std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
WinogradDomainSpec winograd_spec;
};
@@ -220,15 +248,18 @@ struct WinogradImpl
* Assigns to the pointers in the `dest` struct and returns true or false to
* indicate whether the given problem can be executed or not.
*/
-template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut>
-bool get_implementation(
- WinogradImpl &dest, // Destination for the selected implementation
- const CPUInfo *,
- const ConvolutionArgs &,
- int max_threads,
- bool fast_mode,
- const WinogradConfig *,
- const arm_gemm::GemmConfig *);
+template <typename TIn,
+ typename TWeight = TIn,
+ typename TOut = TIn,
+ typename TWinogradIn = TIn,
+ typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *);
} // namespace winograd
} // namespace arm_conv
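
A hedged usage sketch of the selector declared above; the wrapper name is illustrative, the include path is taken from this diff, and CPUInfo/ConvolutionArgs are assumed to come in through the header's own includes:

    #include "src/core/NEON/kernels/assembly/winograd.hpp"

    using namespace arm_conv::winograd;

    // Fill `impl` with an fp32 Winograd decomposition if one exists for
    // the described convolution; returns false otherwise.
    bool select_fp32_winograd(WinogradImpl &impl, const CPUInfo *ci, const ConvolutionArgs &args, int max_threads)
    {
        return get_implementation<float>(impl, ci, args, max_threads,
                                         /* fast_mode */ false,
                                         /* WinogradConfig */ nullptr,
                                         /* arm_gemm::GemmConfig */ nullptr);
    }
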
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
index ed5254a0a4..e3d9b670b3 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -37,12 +38,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float16_t>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+                // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Conctruct vectors
- const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float16_t denominator = sqrt(input_var[x] + epsilon);
- const float16_t numerator = input_ptr[x] - input_mean[x];
- const float16_t x_bar = numerator / denominator;
- float16_t res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+                // Construct vectors
+ const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float16_t denominator = sqrt(input_var[x] + epsilon);
+ const float16_t numerator = input_ptr[x] - input_mean[x];
+ const float16_t x_bar = numerator / denominator;
+ float16_t res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float16_t *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float16_t *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>>}};
+} // namespace
namespace cpu
{
-void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
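Independent of the formatting changes, every kernel in this patch computes the same transform: x_bar = (x - mean) / sqrt(var + epsilon), then res = beta + gamma * x_bar, with an optional fused activation applied to res. A minimal scalar reference of that per-element math (an illustrative sketch, not part of this commit; batch_norm_ref is a hypothetical name):

    #include <cmath>

    // Scalar reference for one element; mirrors the math in the vector loops above.
    inline float batch_norm_ref(float x, float mean, float var,
                                float gamma, float beta, float epsilon)
    {
        const float x_bar = (x - mean) / std::sqrt(var + epsilon); // normalize
        return beta + x_bar * gamma;                               // scale and shift
    }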
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
index d6e22e1843..4e1654ee6b 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -36,12 +37,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
@@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+                // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Conctruct vectors
- const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float denominator = sqrt(input_var[x] + epsilon);
- const float numerator = input_ptr[x] - input_mean[x];
- const float x_bar = numerator / denominator;
- float res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+                // Construct vectors
+ const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float denominator = sqrt(input_var[x] + epsilon);
+ const float numerator = input_ptr[x] - input_mean[x];
+ const float x_bar = numerator / denominator;
+ float res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>>}};
+} // namespace
namespace cpu
{
-void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
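Both NEON variants share the same traversal pattern: a vector main loop stepping by window_step_x, then a scalar loop for the left-over elements that do not fill a full register. A self-contained sketch of that idiom (hypothetical helper, not part of this commit):

    #include <arm_neon.h>

    // Scale a buffer by 2: 4-wide vector main loop, then scalar left-overs.
    void scale_by_two(const float *in, float *out, int end)
    {
        int x = 0;
        for (; x <= end - 4; x += 4) // full float32x4_t vectors
        {
            vst1q_f32(out + x, vmulq_n_f32(vld1q_f32(in + x), 2.f));
        }
        for (; x < end; ++x) // remaining 0..3 elements
        {
            out[x] = in[x] * 2.f;
        }
    }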
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index 98cd9aa7fe..48caaa3e63 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
@@ -37,8 +38,15 @@ namespace arm_compute
{
namespace cpu
{
-void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f16(epsilon);
const auto const_1 = svdup_n_f16(1.f);
const auto const_0 = svdup_n_f16(0.f);
const auto va = svdup_n_f16(act_info.a());
const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = svld1_f16(pg, input_mean + x);
- const auto var_vec = svld1_f16(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f16(tmp);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+                // Construct vectors
+ const auto mean_vec = svld1_f16(pg, input_mean + x);
+ const auto var_vec = svld1_f16(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f16_z(pg, numerator, denominator);
- auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f16(tmp);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f16_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f16_z(pg, numerator, denominator);
+ auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f16_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f16(pg, output_ptr + x, res);
+ // Store results
+ svst1_f16(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
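Unlike the NEON kernels, the SVE variants need no scalar tail: svwhilelt builds a predicate covering only the lanes still in range, and the do/while runs until that predicate has no active lane. A self-contained sketch of the idiom (hypothetical helper, not part of this commit; requires an SVE-enabled toolchain):

    #include <arm_sve.h> // e.g. compile with -march=armv8-a+sve

    // Copy n floats with a predicated loop and no separate tail handling.
    void copy_f32(const float *in, float *out, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // active lanes for this iteration
        do
        {
            svst1_f32(pg, out + x, svld1_f32(pg, in + x));
            x += svcntw(); // advance by the hardware vector length in words
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg));
    }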
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index 952ab320bf..df4fbfe607 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
@@ -37,8 +38,15 @@ namespace arm_compute
{
namespace cpu
{
-void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f32(epsilon);
const auto const_1 = svdup_n_f32(1.f);
const auto const_0 = svdup_n_f32(0.f);
const auto va = svdup_n_f32(act_info.a());
const auto vb = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = svld1_f32(pg, input_mean + x);
- const auto var_vec = svld1_f32(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f32(tmp);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+                // Construct vectors
+ const auto mean_vec = svld1_f32(pg, input_mean + x);
+ const auto var_vec = svld1_f32(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f32_z(pg, numerator, denominator);
- auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f32(tmp);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f32_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f32_z(pg, numerator, denominator);
+ auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f32_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f32(pg, output_ptr + x, res);
+ // Store results
+ svst1_f32(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
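The denominator computation in both SVE kernels refines the rough reciprocal square-root estimate twice: svrsqrte gives an initial r approximating 1/sqrt(t), and each svrsqrts step returns (3 - a*b)/2, so r = r * svrsqrts(t*r, r) is one Newton-Raphson iteration r' = r * (3 - t*r^2) / 2. Isolated as a sketch (hypothetical helper, not part of this commit):

    #include <arm_sve.h>

    // Two Newton-Raphson refinements of the FRSQRTE estimate of 1/sqrt(t).
    svfloat32_t rsqrt_refined(svbool_t pg, svfloat32_t t)
    {
        svfloat32_t r = svrsqrte_f32(t); // rough initial estimate
        r = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, t, r), r), r);
        r = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, t, r), r), r);
        return r;
    }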
diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h
index 8e0ea36f5a..cbf540bd71 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/list.h
+++ b/src/core/NEON/kernels/batchnormalization/impl/list.h
@@ -28,9 +28,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
- void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
+ void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \
+ const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization);
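For reference, each DECLARE_BATCH_NORMALIZATION_KERNEL(name) above expands to the shared signature implemented by the four kernels in this patch, e.g.:

    void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean,
                                       const ITensor *var, const ITensor *beta,
                                       const ITensor *gamma, float epsilon,
                                       ActivationLayerInfo &act_info, const Window &window);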
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 3900ea62cd..95cdc8f2f9 100644
--- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -158,8 +159,7 @@ struct logistic
*
* @param[in] act_info Activation layer information.
*/
- explicit logistic(ActivationLayerInfo act_info)
- : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
+ explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
@@ -198,8 +198,7 @@ struct relu
*
* @param[in] act_info Activation layer information.
*/
- explicit relu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
+ explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
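These activation functors are constructed once per kernel invocation from the ActivationLayerInfo and then applied in place to a vector register inside the inner loop, which is what lets the batch-normalization kernels fuse the activation at no extra cost. A simplified, non-generic sketch of the shape of such a functor (the real templates are parameterised over type and vector width; this fixed-width version is illustrative only):

    #include <arm_neon.h>

    // Fixed-width sketch of detail::relu: clamp a float32x4_t at zero in place.
    struct relu_sketch
    {
        float32x4_t vzero = vdupq_n_f32(0.f);
        void operator()(float32x4_t &vval) const { vval = vmaxq_f32(vzero, vval); }
    };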
diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
index ac196d9dbb..50fff04cad 100644
--- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
+++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IMultiImage.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/NEON/NEMath.h"
#include <arm_neon.h>
@@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f;
constexpr float rgb2u8_green_coef = 0.7152f;
constexpr float rgb2u8_blue_coef = 0.0722f;
-inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
- const float rcoef, const float gcoef, const float bcoef)
+inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor,
+ const float32x4_t &gcolor,
+ const float32x4_t &bcolor,
+ const float rcoef,
+ const float gcoef,
+ const float bcoef)
{
float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef);
@@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
}
-inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
- float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec,
+ const float32x4_t &gvec,
+ const float32x4_t &bvec,
+ float32x4_t &yvec,
+ float32x4_t &uvec,
+ float32x4_t &vvec)
{
/*
Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
@@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g
vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
}
-inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
- float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val,
+ float32x4_t uvec_val,
+ const float32x4_t &yyvec_val,
+ float32x4_t vvec_val,
+ unsigned char *output_ptr,
+ const bool alpha)
{
float32x4x3_t rgb1, rgb2;
@@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
// b = 1.8556f*f_u + 0.0000f*f_v;
const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
- const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
- vmulq_n_f32(vvec_val, green_coef2_bt709));
+ const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709));
// Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
// the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
@@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
uint8x8x3_t u8_rgb;
arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
- if(!alpha)
+ if (!alpha)
{
vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
@@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
{
uint8x16x3_t rgb;
- if(alpha)
+ if (alpha)
{
const auto tmp = vld4q_u8(ptr);
rgb.val[0] = tmp.val[0];
@@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
- fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
- rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
- fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+ rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i],
+ fvvec_top.val[i]);
+ rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i],
+ fuvec_bottom.val[i], fvvec_bottom.val[i]);
}
arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
@@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
}
-inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_uv)
{
uint8x16x3_t vec_top, vec_bottom;
@@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec
vst2_u8(out_uv, uvvec);
}
-inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
{
@@ -273,14 +295,16 @@ inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec
const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
- const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
- vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+ const auto uvvec =
+ vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
vst1_u8(out_u, vget_low_u8(uvvec));
vst1_u8(out_v, vget_high_u8(uvvec));
}
-inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec,
+ const uint8x16_t &gvec,
+ const uint8x16_t &bvec,
unsigned char *const __restrict out_y,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
@@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
float32x4x4_t fyvec, fuvec, fvvec;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
- fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+ rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]);
}
uint8x16_t yvec, uvec, vvec;
@@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
vst1q_u8(out_v, vvec);
}
#endif /* DOXYGEN_SKIP_THIS */
-}
+} // namespace
namespace arm_compute
{
@@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16x4_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- ta2.val[3] = vdupq_n_u8(255);
- vst4q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16x4_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ ta2.val[3] = vdupq_n_u8(255);
+ vst4q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGB to U8.
@@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16_t ta2;
- rgb_to_u8_conversion(ta1, ta2);
- vst1q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16_t ta2;
+ rgb_to_u8_conversion(ta1, ta2);
+ vst1q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGBX to RGB.
@@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld4q_u8(in.ptr());
- uint8x16x3_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- vst3q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld4q_u8(in.ptr());
+ uint8x16x3_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ vst3q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert YUYV to RGB.
@@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta = vld4q_u8(in.ptr());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V7 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
- const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
- const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
- const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
-
- yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta = vld4q_u8(in.ptr());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
+ const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
+ const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
+ const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
+
+ yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size,
+ alpha);
+ },
+ in, out);
}
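yuyv_to_rgb_calculation applies the inverse BT.709 transform: R depends only on V, B only on U, and G on both. The blue coefficient 1.8556 is visible in the comments above; the remaining coefficients below are the standard BT.709 values, and the 128 chroma offset is an assumption about the surrounding code (scalar sketch with a hypothetical helper name, not part of this commit):

    // Scalar reference for the vector-wise BT.709 YUV -> RGB step above.
    inline void yuv_to_rgb_ref(float y, float u, float v, float &r, float &g, float &b)
    {
        u -= 128.f; // assumed chroma offset removal
        v -= 128.f;
        r = y + 1.5748f * v;
        g = y - 0.1873f * u - 0.4681f * v;
        b = y + 1.8556f * u;
    }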
/** Convert NV12 to RGB.
@@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_uv(input_ptr->plane(1), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
-
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_uv, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
+
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_uv, out);
}
/** Convert IYUV to RGB.
@@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_v(input_ptr->plane(2), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto *y_top_ptr = in_y.ptr();
- const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
- const auto *u_ptr = in_u.ptr();
- const auto *v_ptr = in_v.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto *y_top_ptr = in_y.ptr();
+ const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
+ const auto *u_ptr = in_u.ptr();
+ const auto *v_ptr = in_v.ptr();
// Work-around issue in gcc >= 9 where vld2q might cause issues with register allocation
#if defined(__arch64__)
- const auto ta0_y_top = vld1q_u8(y_top_ptr);
- const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
- const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
- const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta0_y_top = vld1q_u8(y_top_ptr);
+ const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
+ const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
+ const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t yyvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#else /* defined(__arch64__) */
- const auto ta_y_top = vld2q_u8(y_top_ptr);
- const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u.val[0] = U0 U2 U4 U6 ...
- //ta_v.val[0] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta_y_top = vld2q_u8(y_top_ptr);
+ const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u.val[0] = U0 U2 U4 U6 ...
+ //ta_v.val[0] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#endif /* defined(__arch64__) */
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_u, in_v, out);
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_u, in_v, out);
}
/** Convert YUYV to NV12.
@@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V7 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16x2_t uvvec;
- uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst2q_u8(out_uv.ptr(), uvvec);
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16x2_t uvvec;
+ uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst2q_u8(out_uv.ptr(), uvvec);
+ },
+ in, out_y, out_uv);
}
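The YUYV -> NV12 path above reduces chroma vertically (4:2:2 to 4:2:0) by averaging the U and V samples of two adjacent rows with vhaddq_u8, a truncating halving add. Scalar equivalent (sketch; the helper name is hypothetical):

    #include <cstdint>

    // vhaddq_u8 lane semantics: (a + b) >> 1 without intermediate overflow.
    inline uint8_t average_chroma(uint8_t top, uint8_t bottom)
    {
        return static_cast<uint8_t>((static_cast<unsigned>(top) + bottom) >> 1);
    }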
/** Convert IYUV to NV12.
@@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- uint8x16x2_t ta_uv;
- ta_uv.val[0] = vld1q_u8(in_u.ptr());
- ta_uv.val[1] = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst2q_u8(out_uv.ptr(), ta_uv);
- },
- in_y, in_u, in_v, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ uint8x16x2_t ta_uv;
+ ta_uv.val[0] = vld1q_u8(in_u.ptr());
+ ta_uv.val[1] = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst2q_u8(out_uv.ptr(), ta_uv);
+ },
+ in_y, in_u, in_v, out_y, out_uv);
}
/** Convert NV12 to IYUV.
@@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
- vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+ vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
/** Convert YUYV to IYUV.
@@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V7 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16_t uvec;
- uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- vst1q_u8(out_u.ptr(), uvec);
-
- uint8x16_t vvec;
- vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst1q_u8(out_v.ptr(), vvec);
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+ //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16_t uvec;
+ uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ vst1q_u8(out_u.ptr(), uvec);
+
+ uint8x16_t vvec;
+ vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst1q_u8(out_v.ptr(), vvec);
+ },
+ in, out_y, out_u, out_v);
}
/** Convert NV12 to YUV4.
@@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_uv.val[0 + shift];
- uvec.val[1] = ta_uv.val[0 + shift];
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_uv.val[1 - shift];
- vvec.val[1] = ta_uv.val[1 - shift];
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_uv.val[0 + shift];
+ uvec.val[1] = ta_uv.val[0 + shift];
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_uv.val[1 - shift];
+ vvec.val[1] = ta_uv.val[1 - shift];
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
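
The NV12/IYUV-to-YUV4 hunks above upsample 4:2:0 chroma to full resolution by nearest neighbour: the same half-resolution vector is placed in both lanes of a vst2q store (doubling it horizontally) and written to two consecutive output rows (doubling it vertically). A minimal standalone sketch of that step — function and parameter names are illustrative, not library API:

    // Illustrative helper, not part of the library.
    #include <arm_neon.h>
    #include <cstddef>
    #include <cstdint>

    void upsample_u_row(const uint8_t *u_half, uint8_t *u_full, size_t out_stride)
    {
        const uint8x16_t u = vld1q_u8(u_half); // 16 half-resolution U samples
        uint8x16x2_t dup;
        dup.val[0] = u;
        dup.val[1] = u;                     // interleaved store duplicates each sample
        vst2q_u8(u_full, dup);              // output row 2*y
        vst2q_u8(u_full + out_stride, dup); // output row 2*y + 1
    }
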
/** Convert IYUV to YUV4.
@@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_u = vld1q_u8(in_u.ptr());
- const auto ta_v = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u = U0 U2 U4 U6 ...
- //ta_v = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_u;
- uvec.val[1] = ta_u;
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_v;
- vvec.val[1] = ta_v;
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_u, in_v, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_u = vld1q_u8(in_u.ptr());
+ const auto ta_v = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u = U0 U2 U4 U6 ...
+ //ta_v = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_u;
+ uvec.val[1] = ta_u;
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_v;
+ vvec.val[1] = ta_v;
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_u, in_v, out_y, out_u, out_v);
}
/** Convert RGB to NV12.
@@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_uv.ptr());
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr());
+ },
+ in, out_y, out_uv);
}
/** Convert RGB to IYUV.
@@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(),
+ out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
/** Convert RGB to YUV4.
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb = load_rgb(in.ptr(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
- out_y.ptr(), out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb = load_rgb(in.ptr(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
} // namespace arm_compute
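
All of the conversion kernels reformatted above share one idiom: vld2q_u8/vld4q_u8 deinterleave packed pixels into per-channel vectors, and vhaddq_u8 (halving add) averages the chroma of two adjacent rows when going from 4:2:2 down to 4:2:0. A minimal standalone sketch of the YUYV-to-IYUV chroma step, assuming the width is a multiple of 32 pixels; the names are illustrative:

    // Illustrative sketch, not library code. Assumes width_pixels % 32 == 0.
    #include <arm_neon.h>
    #include <cstdint>

    void yuyv_rows_to_u_plane(const uint8_t *row_top, const uint8_t *row_bottom,
                              uint8_t *out_u, int width_pixels)
    {
        for (int x = 0; x < width_pixels; x += 32) // 32 pixels = 64 YUYV bytes
        {
            const uint8x16x4_t top    = vld4q_u8(row_top + 2 * x);
            const uint8x16x4_t bottom = vld4q_u8(row_bottom + 2 * x);
            // val[1] holds the U samples of 16 pixel pairs; average the two rows.
            vst1q_u8(out_u + x / 2, vhaddq_u8(top.val[1], bottom.val[1]));
        }
    }
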
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index 96defbc9c9..4b1eb079b2 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -33,56 +33,32 @@ namespace detail
{
inline float32x4x3_t load_matrix_row(const float *ptr)
{
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
+float32x4x2_t convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2);
template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<1>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- float32x4x2_t out =
- {
- {
- vmulq_f32(vtop.val[0], m0.val[0]),
- vmulq_f32(vtop.val[1], m0.val[0])
- }
- };
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}};
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
@@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<2>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
@@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<3>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
@@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio
{
return num_elems_written_per_iteration * 3;
}
-}
+} // namespace detail
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
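
The compacted initializers above leave the core of convolve_3x3<1> easier to see: each row loads 12 consecutive floats, and the x+1/x+2 tap windows are formed with vextq_f32 instead of overlapping reloads. A standalone sketch of the same sliding-window idea at half width (4 outputs from 8 loaded floats); names are hypothetical:

    // Illustrative sketch of the vext sliding window; not library code.
    #include <arm_neon.h>

    void row_conv3_f32(const float *in, const float *w, float *out)
    {
        const float32x4_t w0 = vdupq_n_f32(w[0]);
        const float32x4_t w1 = vdupq_n_f32(w[1]);
        const float32x4_t w2 = vdupq_n_f32(w[2]);
        const float32x4_t v0 = vld1q_f32(in);     // in[0..3]
        const float32x4_t v1 = vld1q_f32(in + 4); // in[4..7]
        float32x4_t acc = vmulq_f32(v0, w0);
        acc = vmlaq_f32(acc, vextq_f32(v0, v1, 1), w1); // window shifted by 1
        acc = vmlaq_f32(acc, vextq_f32(v0, v1, 2), w2); // window shifted by 2
        vst1q_f32(out, acc); // out[i] = in[i]*w0 + in[i+1]*w1 + in[i+2]*w2
    }
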
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index 7ba52a16b7..fd1ee54597 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -45,14 +45,7 @@ namespace detail
inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
{
ARM_COMPUTE_UNUSED(weights_offset);
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
@@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
*
* @return The loaded matrix.
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
{
const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
/* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- int32x4x3_t r =
- {
- {
- vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
- }
- };
+ int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}};
return r;
}
@@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, int input_offset)
+inline float32x4_t single_convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + dilation_x),
- vld1q_f32(in_top + 2 * dilation_x)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + dilation_x),
- vld1q_f32(in_mid + 2 * dilation_x)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + dilation_x),
- vld1q_f32(in_low + 2 * dilation_x)
- }
- };
+ const float32x4x3_t vtop = {
+ {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}};
+ const float32x4x3_t vmid = {
+ {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}};
+ const float32x4x3_t vlow = {
+ {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}};
float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
out = vmlaq_f32(out, vtop.val[1], m0.val[1]);
out = vmlaq_f32(out, vtop.val[2], m0.val[2]);
@@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+inline float32x4x2_t convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ float32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
}
@@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_
*
*/
template <bool accumulate>
-void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset = 0);
+void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0);
template <bool accumulate>
-inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset)
+inline void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f)
- }
- };
- if(stridex == 2)
+ float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}};
+ if (stridex == 2)
{
const float32x4x2_t vtop = vld2q_f32(in_top);
const float32x4x2_t vmid = vld2q_f32(in_mid);
@@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
}
else
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
@@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- size_t dilation_x, int32_t input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4_t single_convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ size_t dilation_x,
+ int32_t input_offset)
{
using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + dilation_x),
- wrapper::vload(in_top + 2 * dilation_x)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + dilation_x),
- wrapper::vload(in_mid + 2 * dilation_x)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + dilation_x),
- wrapper::vload(in_low + 2 * dilation_x)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
- }
- };
+ const VectorType vtop = {
+ {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}};
+ const VectorType vmid = {
+ {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}};
+ const VectorType vlow = {
+ {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
+ }};
int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
@@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid,
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4x2_t convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- int32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ int32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
}
@@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const
* @param[in] input_offset Input quantization offset.
*
*/
-template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
-void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- unsigned int stridex, int32_t input_offset)
+template <bool accumulate,
+ typename T1,
+ typename T2,
+ ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value)>
+void convolve_3x3(const T1 *in_top,
+ const T1 *in_mid,
+ const T1 *in_low,
+ T2 *out_ptr,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ unsigned int stridex,
+ int32_t input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
@@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + 8)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + 8)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + 8)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- }
- };
-
- int32x4x2_t out
- {
- {
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- }
- };
+ const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}};
+ const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}};
+ const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ }};
+
+ int32x4x2_t out{{
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ }};
// 0
out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
@@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
- if(stridex == 1)
+ if (stridex == 1)
{
accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
}
- else if(stridex == 2)
+ else if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
@@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
ARM_COMPUTE_UNUSED(weights_offset);
/* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- const float16x8x3_t r =
- {
- {
- vld1q_dup_f16(ptr),
- vld1q_dup_f16(1 + ptr),
- vld1q_dup_f16(2 + ptr)
- }
- };
+ const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}};
return r;
}
@@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, int input_offset = 0)
+inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + dilation_x),
- vld1q_f16(in_top + 2 * dilation_x)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + dilation_x),
- vld1q_f16(in_mid + 2 * dilation_x)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + dilation_x),
- vld1q_f16(in_low + 2 * dilation_x)
- }
- };
+ const float16x8x3_t vtop = {
+ {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}};
+ const float16x8x3_t vmid = {
+ {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}};
+ const float16x8x3_t vlow = {
+ {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}};
float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
@@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
- float16x8x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
- }
- };
-
- if(stridex == 2)
+inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
+{
+ float16x8x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}};
+
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
@@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
*
*/
template <bool accumulate>
-inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- unsigned int stridex, int input_offset = 0)
+inline void convolve_3x3(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ float16_t *out_ptr,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- float16x8x2_t out =
- {
- {
- vdupq_n_f16(0),
- vdupq_n_f16(0)
- }
- };
- if(stridex == 2)
+ float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}};
+ if (stridex == 2)
{
const float16x8x2_t vtop = vld2q_f16(in_top);
const float16x8x2_t vmid = vld2q_f16(in_mid);
@@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
}
else
{
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + 8),
- vld1q_f16(in_top + 16)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + 8),
- vld1q_f16(in_mid + 16)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + 8),
- vld1q_f16(in_low + 16)
- }
- };
- out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
+ const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}};
+ const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}};
+ const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}};
+ out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
@@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
*/
inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
{
- switch(stridex)
+ switch (stridex)
{
case 1:
return num_elems_written_per_iteration;
@@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter
return 0;
}
}
-}
+} // namespace detail
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
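
In the quantized overloads above, each group of eight uint8 inputs is widened to int32 and the quantization offset is folded in with one widening add; wrapper::vaddw(voffset, vreinterpret(vgetlow(vmovl(v)))) wraps the raw-NEON sequence below. The helper name is illustrative, not library API:

    // Illustrative helper; raw-NEON form of the wrapped widening pattern.
    #include <arm_neon.h>
    #include <cstdint>

    int32x4_t widen_with_offset(const uint8_t *ptr, int32_t input_offset)
    {
        const int32x4_t  voffset = vdupq_n_s32(input_offset);
        const uint16x8_t v16     = vmovl_u8(vld1_u8(ptr));         // u8  -> u16
        const int16x4_t  lo      = vreinterpret_s16_u16(vget_low_u16(v16));
        return vaddw_s16(voffset, lo);                             // + offset -> s32
    }
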
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h
index 1c77a9e9f0..381de2284a 100644
--- a/src/core/NEON/wrapper/intrinsics/cvt.h
+++ b/src/core/NEON/wrapper/intrinsics/cvt.h
@@ -30,12 +30,11 @@ namespace arm_compute
{
namespace wrapper
{
-#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
- vcvt(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
+#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type vcvt(const vtype &a) \
+ { \
+ return prefix##_##postfix1##_##postfix2(a); \
}
VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
@@ -46,12 +45,11 @@ VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16)
#undef VCVT_TO_F32_IMPL
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
- vcvt(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type vcvt(const vtype &a) \
+ { \
+ return prefix##_##postfix1##_##postfix2(a); \
}
VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
@@ -59,14 +57,14 @@ VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t >::type
+inline typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t>::type
vcvt(const float32x4_t &a)
{
return vcvtq_u32_f32(a);
}
template <typename T>
-inline typename std::enable_if < std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t >::type
+inline typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t>::type
vcvt(const float32x4_t &a)
{
return vcvtq_s32_f32(a);
@@ -74,15 +72,13 @@ vcvt(const float32x4_t &a)
#ifdef __aarch64__
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type
-vcvta(const float32x4_t &a)
+inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvta(const float32x4_t &a)
{
return vcvtaq_u32_f32(a);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type
-vcvta(const float32x4_t &a)
+inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type vcvta(const float32x4_t &a)
{
return vcvtaq_s32_f32(a);
}
@@ -96,14 +92,13 @@ vcvta(const float32x4_t &a)
*/
inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
{
- __asm __volatile(
- "ldp q0, q1, [%[inptr]]\n"
- ".inst 0xea16800\n" // BFCVTN v0, v0
- ".inst 0x4ea16820\n" // BFCVTN2 v0, v1
- "str q0, [%[outptr]]\n"
- : [inptr] "+r"(inptr)
- : [outptr] "r"(outptr)
- : "v0", "v1", "memory");
+ __asm __volatile("ldp q0, q1, [%[inptr]]\n"
+ ".inst 0xea16800\n" // BFCVTN v0, v0
+ ".inst 0x4ea16820\n" // BFCVTN2 v0, v1
+ "str q0, [%[outptr]]\n"
+ : [inptr] "+r"(inptr)
+ : [outptr] "r"(outptr)
+ : "v0", "v1", "memory");
}
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
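
The reflowed templates in cvt.h dispatch on the destination element type through std::enable_if, so a caller writes vcvt<float>(v) and overload resolution picks the matching intrinsic. A self-contained sketch of the same dispatch pattern — my_vcvt is a stand-in, not the library symbol:

    // my_vcvt is an illustrative stand-in for the wrapper's vcvt.
    #include <arm_neon.h>
    #include <type_traits>

    template <typename T>
    inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type
    my_vcvt(const int32x4_t &a)
    {
        return vcvtq_f32_s32(a);
    }

    template <typename T>
    inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type
    my_vcvt(const float32x4_t &a)
    {
        return vcvtq_s32_f32(a);
    }

    // Usage: float32x4_t f = my_vcvt<float>(vdupq_n_s32(7));
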
diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h
index 265f30d33b..ece991a5b0 100644
--- a/src/core/NEON/wrapper/intrinsics/div.h
+++ b/src/core/NEON/wrapper/intrinsics/div.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_DIV_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h
index e2207648e5..0e34462b96 100644
--- a/src/core/NEON/wrapper/intrinsics/erf.h
+++ b/src/core/NEON/wrapper/intrinsics/erf.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_WRAPPER_ERF_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h
index c2a6970967..f44577b926 100644
--- a/src/core/NEON/wrapper/intrinsics/exp.h
+++ b/src/core/NEON/wrapper/intrinsics/exp.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_EXP_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h
index 2052751612..ae813bb2fa 100644
--- a/src/core/NEON/wrapper/intrinsics/getlane.h
+++ b/src/core/NEON/wrapper/intrinsics/getlane.h
@@ -33,7 +33,7 @@ namespace wrapper
#define VGETLANE_IMPL_8(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -59,7 +59,7 @@ namespace wrapper
#define VGETLANE_IMPL_4(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -77,7 +77,7 @@ namespace wrapper
#define VGETLANE_IMPL_2(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -102,7 +102,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_16(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -144,7 +144,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_8(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -170,7 +170,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_4(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -188,7 +188,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_2(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
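
The switch bodies these macros expand to exist because NEON lane accessors require a compile-time-constant lane index; a runtime index has to be dispatched case by case. A minimal illustration:

    // Minimal illustration; the library's macros report an error on an
    // out-of-range lane rather than returning a default.
    #include <arm_neon.h>

    float get_lane(float32x4_t v, unsigned int lane)
    {
        switch (lane)
        {
            case 0:
                return vgetq_lane_f32(v, 0);
            case 1:
                return vgetq_lane_f32(v, 1);
            case 2:
                return vgetq_lane_f32(v, 2);
            case 3:
                return vgetq_lane_f32(v, 3);
            default:
                return 0.f;
        }
    }
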
diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h
index de398b0403..e443be679b 100644
--- a/src/core/NEON/wrapper/intrinsics/inv.h
+++ b/src/core/NEON/wrapper/intrinsics/inv.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_INV_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h
index 2343efa8f8..257b445cc7 100644
--- a/src/core/NEON/wrapper/intrinsics/invsqrt.h
+++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_INVSQRT_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h
index 357a77ca78..d091407edb 100644
--- a/src/core/NEON/wrapper/intrinsics/log.h
+++ b/src/core/NEON/wrapper/intrinsics/log.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_LOG_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h
index 61f834ed23..dfd6ccc358 100644
--- a/src/core/NEON/wrapper/intrinsics/pow.h
+++ b/src/core/NEON/wrapper/intrinsics/pow.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_POW_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h
index 167f3cf43b..9a0a23a241 100644
--- a/src/core/NEON/wrapper/intrinsics/qmov.h
+++ b/src/core/NEON/wrapper/intrinsics/qmov.h
@@ -31,15 +31,13 @@ namespace arm_compute
namespace wrapper
{
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type vqmov(const int16x8_t &a)
{
return vqmovun_s16(a);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type vqmov(const int16x8_t &a)
{
return vqmovn_s16(a);
}
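
The two vqmov overloads above select between the signed-to-unsigned and signed-to-signed saturating narrows. A quick standalone illustration of the clamping behaviour:

    // Standalone demo of saturating narrows; not library code.
    #include <arm_neon.h>
    #include <cstdint>

    void demo_qmov()
    {
        const int16x8_t v = vdupq_n_s16(300);
        uint8x8_t u = vqmovun_s16(v); // every lane clamps to 255
        int8x8_t  s = vqmovn_s16(v);  // every lane clamps to 127
        (void)u;
        (void)s;
    }
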
diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h
index cf00a4aceb..c2c4f720d2 100644
--- a/src/core/NEON/wrapper/intrinsics/reinterpret.h
+++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -35,7 +35,7 @@ namespace wrapper
{ \
return prefix##_##postfix1##_##postfix2(a); \
} \
- \
+ \
inline ptype vreinterpret(const ptype &a) \
{ \
return a; \
diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h
index d23feb6b42..7789aab770 100644
--- a/src/core/NEON/wrapper/intrinsics/round.h
+++ b/src/core/NEON/wrapper/intrinsics/round.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_ROUND_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h
index 197eedacb5..259b8eaf90 100644
--- a/src/core/NEON/wrapper/intrinsics/setlane.h
+++ b/src/core/NEON/wrapper/intrinsics/setlane.h
@@ -33,7 +33,7 @@ namespace wrapper
#define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -59,7 +59,7 @@ namespace wrapper
#define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -77,7 +77,7 @@ namespace wrapper
#define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -102,7 +102,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
@@ -144,7 +144,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
@@ -170,7 +170,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h
index 73ca9c56c6..6ccb9cdf92 100644
--- a/src/core/NEON/wrapper/intrinsics/shr.h
+++ b/src/core/NEON/wrapper/intrinsics/shr.h
@@ -75,7 +75,7 @@ VQRSHRN_SCALAR_IMPL(uint32_t, uint64_t, vqrshrnd_n, u64)
{ \
return prefix_signed##_##postfix(a, b); \
} \
- \
+ \
template <int b, typename T> \
inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
vqrshrn_ex(const vtype &a) \
@@ -128,7 +128,7 @@ VSHRQ_SCALAR_IMPL(int32_t, vshrd_n, s64)
{ \
return prefix_signed##_##postfix(a, b); \
} \
- \
+ \
template <int b, typename T> \
inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
vqrshrn_ex(const vtype &a) \
diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h
index 03c2813a32..d24fdfa816 100644
--- a/src/core/NEON/wrapper/intrinsics/sin.h
+++ b/src/core/NEON/wrapper/intrinsics/sin.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_SIN_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -54,4 +55,4 @@ VSIN_IMPL_INT(int32x4_t, vsinq, s32)
#undef vsub_IMPL
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_WRAPPER_SIN_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h
index e530e7c83f..c4652504b4 100644
--- a/src/core/NEON/wrapper/intrinsics/svcnt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcnt.h
@@ -30,7 +30,7 @@ namespace arm_compute
namespace wrapper
{
template <size_t element_size>
-inline uint64_t svcnt_size();
+inline uint64_t svcnt_size();
template <>
inline uint64_t svcnt_size<64>()
@@ -65,4 +65,4 @@ inline uint64_t svcnt()
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h
index 746b004d7d..00ef7b7eb3 100644
--- a/src/core/NEON/wrapper/intrinsics/svcvt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcvt.h
@@ -29,11 +29,12 @@ namespace arm_compute
{
namespace wrapper
{
-#define SVCVT_Z_TO_F32_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_f32_z(pg, a); \
+#define SVCVT_Z_TO_F32_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_f32_z(pg, a); \
}
SVCVT_Z_TO_F32_IMPL(svuint32_t)
@@ -42,11 +43,12 @@ SVCVT_Z_TO_F32_IMPL(svfloat16_t)
#undef SVCVT_Z_TO_F32_IMPL
-#define SVCVT_Z_TO_F16_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_f16_z(pg, a); \
+#define SVCVT_Z_TO_F16_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_f16_z(pg, a); \
}
SVCVT_Z_TO_F16_IMPL(svuint32_t)
@@ -55,11 +57,12 @@ SVCVT_Z_TO_F16_IMPL(svfloat32_t)
#undef SVCVT_Z_TO_F16_IMPL
-#define SVCVT_Z_TO_S32_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_s32_z(pg, a); \
+#define SVCVT_Z_TO_S32_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_s32_z(pg, a); \
}
SVCVT_Z_TO_S32_IMPL(svfloat16_t)
@@ -71,4 +74,4 @@ SVCVT_Z_TO_S32_IMPL(svfloat32_t)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h
index d6ce9a77d1..1e8bce3960 100644
--- a/src/core/NEON/wrapper/intrinsics/svexp.h
+++ b/src/core/NEON/wrapper/intrinsics/svexp.h
@@ -26,6 +26,7 @@
#if defined(__ARM_FEATURE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -46,4 +47,4 @@ SVEXP_IMPL(svfloat16_t, f16)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h
index 5b505ae1e3..b4630e20ed 100644
--- a/src/core/NEON/wrapper/intrinsics/svlog.h
+++ b/src/core/NEON/wrapper/intrinsics/svlog.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H
#if defined(__ARM_FEATURE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -44,4 +45,4 @@ SVLOG_IMPL(svfloat16_t, f16)
} // namespace wrapper
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h
index 53407e5301..6ed00bccbf 100644
--- a/src/core/NEON/wrapper/intrinsics/svptrue.h
+++ b/src/core/NEON/wrapper/intrinsics/svptrue.h
@@ -30,7 +30,7 @@ namespace arm_compute
namespace wrapper
{
template <size_t element_size>
-inline svbool_t svptrue_size();
+inline svbool_t svptrue_size();
template <>
inline svbool_t svptrue_size<64>()
@@ -65,4 +65,4 @@ svbool_t svptrue()
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
index ef58217dc4..f0f84a9508 100644
--- a/src/core/NEON/wrapper/intrinsics/svwhilelt.h
+++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
@@ -32,7 +32,7 @@ namespace wrapper
#define SVWHILELT_IMPL(type) \
template <size_t element_size> \
inline svbool_t svwhilelt_size(type a, type b); \
- \
+ \
template <> \
inline svbool_t svwhilelt_size<64>(type a, type b) \
{ \
@@ -70,4 +70,4 @@ inline svbool_t svwhilelt(IndexType a, IndexType b)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h
index daeaf19997..e74f0e86fe 100644
--- a/src/core/NEON/wrapper/intrinsics/tanh.h
+++ b/src/core/NEON/wrapper/intrinsics/tanh.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_TANH_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h
index 642d9261f3..2ec88869e3 100644
--- a/src/core/NEON/wrapper/scalar/add.h
+++ b/src/core/NEON/wrapper/scalar/add.h
@@ -32,22 +32,22 @@ namespace wrapper
{
inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+ const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+ const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
return vget_lane_u8(vqadd_u8(va, vb), 0);
}
inline int16_t add_sat(const int16_t &a, const int16_t &b)
{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
+ const int16x4_t va = {a, 0, 0, 0};
+ const int16x4_t vb = {b, 0, 0, 0};
return vget_lane_s16(vqadd_s16(va, vb), 0);
}
inline int32_t add_sat(const int32_t &a, const int32_t &b)
{
- const int32x2_t va = { a, 0 };
- const int32x2_t vb = { b, 0 };
+ const int32x2_t va = {a, 0};
+ const int32x2_t vb = {b, 0};
return vget_lane_s32(vqadd_s32(va, vb), 0);
}
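
The brace-initialiser reflow here is purely cosmetic; the underlying trick is unchanged: the scalar operand is placed in lane 0 of a NEON vector so the saturating vector instruction performs the clamping. A minimal usage sketch, assuming <arm_neon.h> is available (illustrative, not part of this patch):

    #include <arm_neon.h>

    #include <cstdint>

    // Saturating scalar add via lane 0: vqadd_u8 clamps to [0, 255],
    // so 200 + 100 yields 255 rather than the wrapped value 44.
    inline uint8_t demo_add_sat()
    {
        const uint8x8_t va = {200, 0, 0, 0, 0, 0, 0, 0};
        const uint8x8_t vb = {100, 0, 0, 0, 0, 0, 0, 0};
        return vget_lane_u8(vqadd_u8(va, vb), 0);
    }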
diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h
index 1fe51d75fc..00de7d867f 100644
--- a/src/core/NEON/wrapper/scalar/sub.h
+++ b/src/core/NEON/wrapper/scalar/sub.h
@@ -32,22 +32,22 @@ namespace wrapper
{
inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+ const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+ const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
return vget_lane_u8(vqsub_u8(va, vb), 0);
}
inline int16_t sub_sat(const int16_t &a, const int16_t &b)
{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
+ const int16x4_t va = {a, 0, 0, 0};
+ const int16x4_t vb = {b, 0, 0, 0};
return vget_lane_s16(vqsub_s16(va, vb), 0);
}
inline int32_t sub_sat(const int32_t &a, const int32_t &b)
{
- const int32x2_t va = { a, 0 };
- const int32x2_t vb = { b, 0 };
+ const int32x2_t va = {a, 0};
+ const int32x2_t vb = {b, 0};
return vget_lane_s32(vqsub_s32(va, vb), 0);
}
diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h
index 5ccd0ba8f1..330d272752 100644
--- a/src/core/NEON/wrapper/svtraits.h
+++ b/src/core/NEON/wrapper/svtraits.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_WRAPPER_SVTRAITS_H
#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
index 99858e2a98..62ce335815 100644
--- a/src/core/Rounding.cpp
+++ b/src/core/Rounding.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/Error.h"
+
#include "support/ToolchainSupport.h"
#include <cmath>
@@ -36,7 +37,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy)
{
using namespace std;
int rounded = 0;
- switch(rounding_policy)
+ switch (rounding_policy)
{
case RoundingPolicy::TO_ZERO:
{
@@ -51,9 +52,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy)
case RoundingPolicy::TO_NEAREST_EVEN:
{
#ifdef __aarch64__
- asm("fcvtns %x[res], %s[value]"
- : [res] "=r"(rounded)
- : [value] "w"(x));
+ asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x));
#else // __aarch64__
ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported.");
#endif // __aarch64__
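
The collapsed asm statement is the same fcvtns instruction, which converts to integer with ties rounded to even. A portable sketch of that semantic using the standard rounding environment instead of inline asm (illustrative, not part of this patch):

    #include <cfenv>
    #include <cmath>

    // Ties-to-even: 2.5f rounds to 2, 3.5f rounds to 4.
    inline int round_half_to_even(float x)
    {
        std::fesetround(FE_TONEAREST); // round-to-nearest-even mode
        return static_cast<int>(std::nearbyintf(x));
    }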
diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp
index 6eb46e56af..69b2651520 100644
--- a/src/core/Size2D.cpp
+++ b/src/core/Size2D.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Size2D.h"
+
#include "support/StringSupport.h"
namespace arm_compute
@@ -30,4 +31,4 @@ std::string Size2D::to_string() const
{
return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height);
}
-}
+} // namespace arm_compute
diff --git a/src/core/Size3D.cpp b/src/core/Size3D.cpp
index 3ee9fb8e5c..b56a99acd7 100644
--- a/src/core/Size3D.cpp
+++ b/src/core/Size3D.cpp
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
#include "arm_compute/core/Size3D.h"
+
#include "support/StringSupport.h"
namespace arm_compute
{
std::string Size3D::to_string() const
{
- return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + support::cpp11::to_string(depth);
+ return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") +
+ support::cpp11::to_string(depth);
}
-} \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 723b6bc016..8012c3d721 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -42,10 +42,10 @@ namespace
TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords)
{
// Extend shape
- for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
{
int dimension_extend = coords[i] + static_cast<int>(shape[i]);
- if((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
+ if ((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
{
parent_shape.set(i, static_cast<size_t>(dimension_extend));
}
@@ -56,23 +56,35 @@ TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coo
} // namespace
SubTensorInfo::SubTensorInfo()
- : _parent(nullptr), _tensor_shape(), _dims_state(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false), _lock_paddings(false)
+ : _parent(nullptr),
+ _tensor_shape(),
+ _dims_state(),
+ _coords(),
+ _valid_region{Coordinates(), _tensor_shape},
+ _extend_parent(false),
+ _lock_paddings(false)
{
}
SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
- : _parent(parent), _tensor_shape(tensor_shape), _dims_state(), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent), _lock_paddings(false)
+ : _parent(parent),
+ _tensor_shape(tensor_shape),
+ _dims_state(),
+ _coords(coords),
+ _valid_region{Coordinates(), _tensor_shape},
+ _extend_parent(extend_parent),
+ _lock_paddings(false)
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(parent->tensor_shape().total_size() != 0 && !_extend_parent)
+ if (parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
}
// Initialize valid region
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
}
std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const
@@ -91,17 +103,17 @@ ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape)
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(_parent->tensor_shape().total_size() != 0 && !_extend_parent)
+ if (_parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
- _valid_region = ValidRegion{ _coords, shape };
+ _valid_region = ValidRegion{_coords, shape};
}
- else if(_extend_parent) // Extend parent shape, configure if specified
+ else if (_extend_parent) // Extend parent shape, configure if specified
{
ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN));
TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords);
_parent->set_tensor_shape(parent_extended_shape);
- _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape });
+ _parent->set_valid_region(ValidRegion{Coordinates(), parent_extended_shape});
}
_tensor_shape = shape;
return *this;
@@ -133,11 +145,11 @@ bool SubTensorInfo::extend_padding(const PaddingSize &padding)
ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0);
// Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor
- if(!_extend_parent && (padding.left || padding.right))
+ if (!_extend_parent && (padding.left || padding.right))
{
ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x());
}
- if(!_extend_parent && (padding.top || padding.bottom))
+ if (!_extend_parent && (padding.top || padding.bottom))
{
ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y());
}
@@ -153,7 +165,7 @@ int32_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const
int32_t offset = offset_first_element_in_bytes();
const Strides &strides = strides_in_bytes();
- for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
{
offset += pos[i] * strides[i];
}
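
Beyond the one-member-per-line constructor layout, the extend-parent logic is untouched: each parent dimension grows just enough to cover the sub-tensor's anchor plus its extent. A worked instance with hypothetical values (not from the patch):

    // Parent shape 8x8, sub-tensor shape 4x4 anchored at coords (6, 2):
    //   x: dimension_extend = 6 + 4 = 10 > 8  -> parent x grows to 10
    //   y: dimension_extend = 2 + 4 =  6 <= 8 -> parent y stays 8
    TensorShape parent{8U, 8U};
    TensorShape sub{4U, 4U};
    Coordinates at{6, 2};
    // extend_parent_shape(parent, sub, at) == TensorShape{10U, 8U}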
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 5905ba5215..31bddbde40 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/Utils.h"
#include <memory>
@@ -34,13 +35,26 @@
namespace arm_compute
{
TensorInfo::TensorInfo()
- : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
- _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id), _lock_paddings(false)
-{
-}
-
-TensorInfo::TensorInfo(const ITensorInfo &info)
- : TensorInfo()
+ : _total_size(0),
+ _offset_first_element_in_bytes(0),
+ _strides_in_bytes(),
+ _num_channels(0),
+ _tensor_shape(),
+ _dims_state(),
+ _data_type(DataType::UNKNOWN),
+ _format(Format::UNKNOWN),
+ _is_resizable{true},
+ _valid_region{Coordinates(), _tensor_shape},
+ _padding{0},
+ _quantization_info(),
+ _data_layout(DataLayout::NCHW),
+ _are_values_constant(true),
+ _id(invalid_tensor_id),
+ _lock_paddings(false)
+{
+}
+
+TensorInfo::TensorInfo(const ITensorInfo &info) : TensorInfo()
{
_total_size = info.total_size();
_offset_first_element_in_bytes = info.offset_first_element_in_bytes();
@@ -60,8 +74,7 @@ TensorInfo::TensorInfo(const ITensorInfo &info)
_lock_paddings = info.lock_paddings();
}
-TensorInfo::TensorInfo(const TensorInfo &info)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorInfo &info) : TensorInfo()
{
_total_size = info.total_size();
_offset_first_element_in_bytes = info.offset_first_element_in_bytes();
@@ -80,8 +93,7 @@ TensorInfo::TensorInfo(const TensorInfo &info)
_id = info.id();
_lock_paddings = false;
}
-TensorInfo::TensorInfo(Format format)
- : TensorInfo(TensorShape(), format)
+TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format)
{
}
@@ -90,25 +102,25 @@ TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format format)
{
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) : TensorInfo()
{
init(tensor_shape, format);
}
-TensorInfo::TensorInfo(size_t num_channels, DataType data_type)
- : TensorInfo()
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type) : TensorInfo()
{
init(TensorShape(), num_channels, data_type);
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) : TensorInfo()
{
init(tensor_shape, num_channels, data_type);
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info)
+TensorInfo::TensorInfo(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info)
: TensorInfo()
{
init(tensor_shape, num_channels, data_type);
@@ -137,9 +149,11 @@ void TensorInfo::init(const TensorShape &tensor_shape, Format format)
_format = format;
}
-void TensorInfo::init(const TensorShape &tensor_shape, Format format,
- const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+ Format format,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
{
size_t num_channels = num_channels_from_format(format);
const DataType type = data_type_from_format(format);
@@ -165,9 +179,12 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data
set_tensor_shape(tensor_shape);
}
-void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
- const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
{
ARM_COMPUTE_ERROR_ON(num_channels == 0);
@@ -179,7 +196,7 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data
_strides_in_bytes = strides_in_bytes;
_total_size = total_size_in_bytes;
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
}
size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
@@ -202,7 +219,7 @@ size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num
_format = Format::UNKNOWN;
_tensor_shape = tensor_shape;
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
auto_padding();
@@ -233,11 +250,11 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c
size_t required_total_size = 0;
const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y;
- switch(_tensor_shape.num_dimensions())
+ switch (_tensor_shape.num_dimensions())
{
case 0:
{
- if(_tensor_shape.total_size() > 0)
+ if (_tensor_shape.total_size() > 0)
{
required_strides = Strides(stride_x, stride_x);
required_total_size = stride_z;
@@ -258,7 +275,8 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c
const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
- required_total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
+ required_total_size =
+ static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
break;
}
}
@@ -284,25 +302,25 @@ bool TensorInfo::extend_padding(const PaddingSize &padding)
bool updated = false;
- if(padding.top > _padding.top)
+ if (padding.top > _padding.top)
{
_padding.top = padding.top;
updated = true;
}
- if(padding.right > _padding.right)
+ if (padding.right > _padding.right)
{
_padding.right = padding.right;
updated = true;
}
- if(padding.bottom > _padding.bottom)
+ if (padding.bottom > _padding.bottom)
{
_padding.bottom = padding.bottom;
updated = true;
}
- if(padding.left > _padding.left)
+ if (padding.left > _padding.left)
{
_padding.left = padding.left;
updated = true;
@@ -336,7 +354,7 @@ ITensorInfo &TensorInfo::set_format(Format format)
{
_format = format;
- if(_data_type == DataType::UNKNOWN)
+ if (_data_type == DataType::UNKNOWN)
{
_num_channels = num_channels_from_format(format);
_data_type = data_type_from_format(format);
@@ -355,19 +373,19 @@ ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape)
_offset_first_element_in_bytes = 0;
_strides_in_bytes = compute_strides(*this);
- if(_tensor_shape.num_dimensions() == 0)
+ if (_tensor_shape.num_dimensions() == 0)
{
_total_size = _strides_in_bytes[0];
}
else
{
const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
- _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
+ _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
}
std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
return *this;
}
@@ -392,9 +410,10 @@ ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout)
ITensorInfo &TensorInfo::reset_padding()
{
_padding = PaddingSize();
- if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
+ if (((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
{
- std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+ std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) =
+ calculate_padding_requirements(_padding);
}
return *this;
}
@@ -405,7 +424,7 @@ int32_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
int32_t offset = _offset_first_element_in_bytes;
- for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
{
offset += pos[i] * _strides_in_bytes[i];
}
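
The offset_element_in_bytes loop keeps the usual strided addressing: offset = offset_first_element_in_bytes plus the sum of pos[i] * strides_in_bytes[i]. A worked 2D example with hypothetical numbers (not from the patch):

    #include <cstdint>

    // For an unpadded 4x3 F32 tensor the strides are {4, 16} bytes, so
    // element (x=2, y=1) sits at 0 + 2 * 4 + 1 * 16 == 24 bytes.
    inline int32_t offset_2d(int32_t first, int x, int y, int stride_x, int stride_y)
    {
        return first + x * stride_x + y * stride_y;
    }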
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 1ca7adb3a8..90a7ac32c0 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -49,7 +49,7 @@ std::string read_file(const std::string &filename, bool binary)
fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
std::ios_base::openmode mode = std::ios::in;
- if(binary)
+ if (binary)
{
mode |= std::ios::binary;
}
@@ -66,7 +66,7 @@ std::string read_file(const std::string &filename, bool binary)
out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what());
}
@@ -77,32 +77,28 @@ std::string read_file(const std::string &filename, bool binary)
const std::string &string_from_channel(Channel channel)
{
- static std::map<Channel, const std::string> channels_map =
- {
- { Channel::UNKNOWN, "UNKNOWN" },
- { Channel::R, "R" },
- { Channel::G, "G" },
- { Channel::B, "B" },
- { Channel::A, "A" },
- { Channel::Y, "Y" },
- { Channel::U, "U" },
- { Channel::V, "V" },
- { Channel::C0, "C0" },
- { Channel::C1, "C1" },
- { Channel::C2, "C2" },
- { Channel::C3, "C3" }
- };
+ static std::map<Channel, const std::string> channels_map = {{Channel::UNKNOWN, "UNKNOWN"},
+ {Channel::R, "R"},
+ {Channel::G, "G"},
+ {Channel::B, "B"},
+ {Channel::A, "A"},
+ {Channel::Y, "Y"},
+ {Channel::U, "U"},
+ {Channel::V, "V"},
+ {Channel::C0, "C0"},
+ {Channel::C1, "C1"},
+ {Channel::C2, "C2"},
+ {Channel::C3, "C3"}};
return channels_map[channel];
}
const std::string &string_from_border_mode(BorderMode border_mode)
{
- static std::map<BorderMode, const std::string> border_mode_map =
- {
- { BorderMode::UNDEFINED, "UNDEFINED" },
- { BorderMode::CONSTANT, "CONSTANT" },
- { BorderMode::REPLICATE, "REPLICATE" },
+ static std::map<BorderMode, const std::string> border_mode_map = {
+ {BorderMode::UNDEFINED, "UNDEFINED"},
+ {BorderMode::CONSTANT, "CONSTANT"},
+ {BorderMode::REPLICATE, "REPLICATE"},
};
return border_mode_map[border_mode];
@@ -110,11 +106,10 @@ const std::string &string_from_border_mode(BorderMode border_mode)
const std::string &string_from_norm_type(NormType type)
{
- static std::map<NormType, const std::string> norm_type_map =
- {
- { NormType::IN_MAP_1D, "IN_MAP_1D" },
- { NormType::IN_MAP_2D, "IN_MAP_2D" },
- { NormType::CROSS_MAP, "CROSS_MAP" },
+ static std::map<NormType, const std::string> norm_type_map = {
+ {NormType::IN_MAP_1D, "IN_MAP_1D"},
+ {NormType::IN_MAP_2D, "IN_MAP_2D"},
+ {NormType::CROSS_MAP, "CROSS_MAP"},
};
return norm_type_map[type];
@@ -122,11 +117,10 @@ const std::string &string_from_norm_type(NormType type)
const std::string &string_from_pooling_type(PoolingType type)
{
- static std::map<PoolingType, const std::string> pool_type_map =
- {
- { PoolingType::MAX, "MAX" },
- { PoolingType::AVG, "AVG" },
- { PoolingType::L2, "L2" },
+ static std::map<PoolingType, const std::string> pool_type_map = {
+ {PoolingType::MAX, "MAX"},
+ {PoolingType::AVG, "AVG"},
+ {PoolingType::L2, "L2"},
};
return pool_type_map[type];
@@ -134,38 +128,36 @@ const std::string &string_from_pooling_type(PoolingType type)
bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info)
{
- if(info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
+ if (info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
{
return false;
}
const auto ps = info.pad_stride_info;
- const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.pad_left(), ps.pad_right() });
- const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.pad_top(), ps.pad_bottom() });
+ const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.pad_left(), ps.pad_right()});
+ const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.pad_top(), ps.pad_bottom()});
return pool_le_padding_x || pool_le_padding_y;
}
bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info)
{
- if(info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0)
+ if (info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0)
{
return false;
}
const auto ps = info.padding;
- const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.left, ps.right });
- const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.top, ps.bottom });
- const auto pool_le_padding_z = info.pool_size.z() <= std::max({ ps.front, ps.back });
+ const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.left, ps.right});
+ const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.top, ps.bottom});
+ const auto pool_le_padding_z = info.pool_size.z() <= std::max({ps.front, ps.back});
return pool_le_padding_x || pool_le_padding_y || pool_le_padding_z;
}
const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage)
{
- static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map =
- {
- { GEMMLowpOutputStageType::NONE, "" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" }
- };
+ static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map = {
+ {GEMMLowpOutputStageType::NONE, ""},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down"},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint"},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float"}};
return output_stage_map[output_stage];
}
@@ -175,7 +167,7 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data
std::stringstream ss;
std::string converted_string;
- switch(data_type)
+ switch (data_type)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -223,11 +215,16 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data
return converted_string;
}
-PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation,
+PadStrideInfo calculate_same_pad(TensorShape input_shape,
+ TensorShape weights_shape,
+ PadStrideInfo conv_info,
+ DataLayout data_layout,
+ const Size2D &dilation,
const DimensionRoundingType &rounding_type)
{
const auto &strides = conv_info.stride();
- ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1.");
+ ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1),
+ "Stride values should be greater than or equal to 1.");
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -246,8 +243,9 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
const int real_weight_height = (kernel_height - 1) * dilation.y() + 1;
// Calculate total pad
- const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
- const int pad_height = std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
+ const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
+ const int pad_height =
+ std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
// Calculate individual paddings
const unsigned int pad_left = pad_width / 2;
@@ -265,8 +263,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
return same_info;
}
-std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
- unsigned int kernel_width, unsigned int kernel_height,
+std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width,
+ unsigned int in_height,
+ unsigned int kernel_width,
+ unsigned int kernel_height,
const PadStrideInfo &pad_stride_info)
{
const unsigned int pad_left = pad_stride_info.pad_left();
@@ -285,8 +285,10 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i
return std::make_pair<unsigned int, unsigned int>(w, h);
}
-std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
- int kernel_width, int kernel_height,
+std::pair<unsigned int, unsigned int> scaled_dimensions(int width,
+ int height,
+ int kernel_width,
+ int kernel_height,
const PadStrideInfo &pad_stride_info,
const Size2D &dilation)
{
@@ -300,15 +302,25 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
const int stride_y = pad_stride_info.stride().second;
int w = 0;
int h = 0;
- switch(pad_stride_info.round())
+ switch (pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+ w = static_cast<int>(std::floor(
+ (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+ 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+ stride_y) +
+ 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+ w = static_cast<int>(std::ceil(
+ (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+ 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+ stride_y) +
+ 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -319,9 +331,8 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
return std::make_pair(static_cast<unsigned int>(w), static_cast<unsigned int>(h));
}
-std::pair<int, int> scaled_dimensions_signed(int width, int height,
- int kernel_width, int kernel_height,
- const PadStrideInfo &pad_stride_info)
+std::pair<int, int> scaled_dimensions_signed(
+ int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info)
{
const int pad_left = pad_stride_info.pad_left();
const int pad_top = pad_stride_info.pad_top();
@@ -331,15 +342,19 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height,
const int stride_y = pad_stride_info.stride().second;
int w = 0;
int h = 0;
- switch(pad_stride_info.round())
+ switch (pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<int>(
+ std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<int>(
+ std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -348,8 +363,12 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height,
return std::make_pair(static_cast<int>(w), static_cast<int>(h));
}
-std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int depth,
- int kernel_width, int kernel_height, int kernel_depth,
+std::tuple<int, int, int> scaled_3d_dimensions_signed(int width,
+ int height,
+ int depth,
+ int kernel_width,
+ int kernel_height,
+ int kernel_depth,
const Pooling3dLayerInfo &pool3d_info)
{
const int pad_left = pool3d_info.padding.left;
@@ -365,17 +384,23 @@ std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int
int h = 0;
int d = 0;
- switch(pool3d_info.round_type)
+ switch (pool3d_info.round_type)
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
- d = static_cast<int>(std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
+ w = static_cast<int>(
+ std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ d = static_cast<int>(
+ std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
- d = static_cast<int>(std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
+ w = static_cast<int>(
+ std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ d = static_cast<int>(
+ std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -400,9 +425,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
// * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128
// * LogSoftmax with QASYMM8: scale = 1/256, offset = 0
// * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127
- if(is_data_type_quantized_asymmetric_signed(input_type))
+ if (is_data_type_quantized_asymmetric_signed(input_type))
{
- if(is_log)
+ if (is_log)
{
return QuantizationInfo(16.f / 256, 127);
}
@@ -414,17 +439,21 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
return QuantizationInfo(1.f / 256, 0);
}
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info)
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info,
+ DataType data_type,
+ UniformQuantizationInfo oq_info)
{
const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type);
const auto a = act_info.a();
const auto b = act_info.b();
- const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
- const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
- const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
+ const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
+ const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
+ const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
- const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
- const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
+ const int32_t min_activation =
+ act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
+ const int32_t max_activation =
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
return std::make_pair(min_activation, max_activation);
}
@@ -433,11 +462,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
{
std::unordered_map<const ITensorInfo *, PaddingSize> res;
- for(const ITensor *tensor : tensors)
+ for (const ITensor *tensor : tensors)
{
- if(tensor)
+ if (tensor)
{
- res.insert({ tensor->info(), tensor->info()->padding() });
+ res.insert({tensor->info(), tensor->info()->padding()});
}
}
@@ -448,11 +477,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
{
std::unordered_map<const ITensorInfo *, PaddingSize> res;
- for(const ITensorInfo *info : infos)
+ for (const ITensorInfo *info : infos)
{
- if(info)
+ if (info)
{
- res.insert({ info, info->padding() });
+ res.insert({info, info->padding()});
}
}
@@ -461,17 +490,20 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map)
{
- return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
- {
- return (padding_info.first->padding() != padding_info.second);
- })
- != padding_map.end();
+ return std::find_if(padding_map.begin(), padding_map.end(),
+ [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
+ { return (padding_info.first->padding() != padding_info.second); }) != padding_map.end();
}
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
-void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
+void print_consecutive_elements(std::ostream &s,
+ DataType dt,
+ const uint8_t *ptr,
+ unsigned int n,
+ int stream_width,
+ const std::string &element_delim)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -481,36 +513,46 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
case DataType::QSYMM8:
case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
- print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U16:
case DataType::QASYMM16:
- print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S16:
case DataType::QSYMM16:
- print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U32:
- print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S32:
- print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U64:
- print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S64:
- print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::BFLOAT16:
- print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::F16:
- print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::F32:
- print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width,
+ element_delim);
break;
default:
ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -519,7 +561,7 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
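
The reflowed expressions in scaled_dimensions compute the standard convolution output size, out = round((in + pad_before + pad_after - (dilation * (kernel - 1) + 1)) / stride) + 1, with round being floor or ceil per the rounding type. A worked instance of the FLOOR branch (hypothetical numbers, not from the patch):

    #include <cmath>

    // e.g. in = 224, pads = 1 + 1, kernel = 3, dilation = 1, stride = 2:
    // floor((224 + 2 - 3) / 2.f) + 1 = 111 + 1 = 112
    inline int conv_out_dim_floor(int in, int pad_l, int pad_r, int kernel, int dilation, int stride)
    {
        const int eff_kernel = dilation * (kernel - 1) + 1; // dilated extent
        return static_cast<int>(std::floor(static_cast<float>(in + pad_l + pad_r - eff_kernel) / stride)) + 1;
    }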
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index 5a6486e11e..d8f796193e 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -23,13 +23,16 @@
*/
#include "arm_compute/core/Validate.h"
-arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &win)
+arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &win)
{
full.validate();
win.validate();
- for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
@@ -38,13 +41,16 @@ arm_compute::Status arm_compute::error_on_mismatching_windows(const char *functi
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &sub)
+arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &sub)
{
full.validate();
sub.validate();
- for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
@@ -54,8 +60,12 @@ arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
+arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &window,
+ const int dim)
{
full.validate();
window.validate();
@@ -67,65 +77,73 @@ arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(co
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
- const arm_compute::Coordinates &pos, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(
+ const char *function, const char *file, const int line, const arm_compute::Coordinates &pos, unsigned int max_dim)
{
- for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line);
}
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
- const arm_compute::Window &win, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_window_dimensions_gte(
+ const char *function, const char *file, const int line, const arm_compute::Window &win, unsigned int max_dim)
{
- for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR((win[i].start() != 0) || (win[i].end() != win[i].step()),
- function, file, line,
- "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+ (win[i].start() != 0) || (win[i].end() != win[i].step()), function, file, line,
+ "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
}
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function,
+ const char *file,
+ const int line,
const arm_compute::ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2,
- function, file, line,
- "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->info()->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, function, file, line,
+ "Only 2D Tensors are supported by this kernel (%zu passed)",
+ tensor->info()->num_dimensions());
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function,
+ const char *file,
+ const int line,
const arm_compute::ITensorInfo *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2,
- function, file, line,
- "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, function, file, line,
+ "Only 2D Tensors are supported by this kernel (%zu passed)",
+ tensor->num_dimensions());
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
- arm_compute::Format fmt, arm_compute::Channel cn)
+arm_compute::Status arm_compute::error_on_channel_not_in_known_format(
+ const char *function, const char *file, const int line, arm_compute::Format fmt, arm_compute::Channel cn)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
- switch(fmt)
+ switch (fmt)
{
case arm_compute::Format::RGB888:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+ arm_compute::Channel::G, arm_compute::Channel::B);
break;
case arm_compute::Format::RGBA8888:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+ arm_compute::Channel::G, arm_compute::Channel::B,
+ arm_compute::Channel::A);
break;
case arm_compute::Format::UV88:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U,
+ arm_compute::Channel::V);
break;
case arm_compute::Format::IYUV:
case arm_compute::Format::UYVY422:
@@ -133,7 +151,8 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char
case arm_compute::Format::NV12:
case arm_compute::Format::NV21:
case arm_compute::Format::YUV444:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y,
+ arm_compute::Channel::U, arm_compute::Channel::V);
break;
default:
ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format.");
@@ -141,21 +160,26 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function,
+ const char *file,
+ const int line,
const arm_compute::IKernel *kernel)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(),
- function, file, line,
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), function, file, line,
"This kernel hasn't been configured.");
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
- const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function,
+ const char *file,
+ const int line,
+ const TensorShape &parent_shape,
+ const Coordinates &coords,
+ const TensorShape &shape)
{
// Check dimensions
- for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
{
const bool invalid_idx = coords[i] >= static_cast<int>(parent_shape[i]);
const bool out_of_bounds_size = coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]);
@@ -164,15 +188,20 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
- const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function,
+ const char *file,
+ const int line,
+ const ValidRegion &parent_valid_region,
+ const ValidRegion &valid_region)
{
// Check valid regions
- for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
- function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(
+ (parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) <
+ (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+ function, file, line);
}
return arm_compute::Status{};
diff --git a/src/core/common/Macros.h b/src/core/common/Macros.h
index d791154e5c..bc0ea29911 100644
--- a/src/core/common/Macros.h
+++ b/src/core/common/Macros.h
@@ -25,9 +25,9 @@
#define ARM_COMPUTE_COMMON_MACROS_H
#define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \
- TypeName(const TypeName &) = delete; \
+ TypeName(const TypeName &) = delete; \
TypeName &operator=(const TypeName &) = delete; \
TypeName(TypeName &&) = default; \
- TypeName &operator=(TypeName &&) = default
+ TypeName &operator=(TypeName &&) = default
#endif /* ARM_COMPUTE_COMMON_MACROS_H */
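
The macro body is realigned only; the semantics are identical: copy construction and copy assignment are deleted while the move operations stay defaulted. A minimal usage sketch (hypothetical class, not part of this patch):

    #include <utility>

    class Workspace
    {
    public:
        Workspace() = default;
        ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(Workspace);
    };

    // Workspace a;
    // Workspace b = a;            // error: copy constructor is deleted
    // Workspace c = std::move(a); // fine: move constructor is defaulted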
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index d6dc3449fc..686304b8d7 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -46,7 +46,7 @@
#else /* !defined(ENABLE_FP16_KERNELS) */
#define REGISTER_FP16_NEON(func_name) nullptr
-#define REGISTER_FP16_SVE(func_name) nullptr
+#define REGISTER_FP16_SVE(func_name) nullptr
#define REGISTER_FP16_SVE2(func_name) nullptr
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
@@ -72,7 +72,7 @@
#else /* defined(ENABLE_FP32_KERNELS) */
#define REGISTER_FP32_NEON(func_name) nullptr
-#define REGISTER_FP32_SVE(func_name) nullptr
+#define REGISTER_FP32_SVE(func_name) nullptr
#define REGISTER_FP32_SVE2(func_name) nullptr
#endif /* defined(ENABLE_FP32_KERNELS) */
@@ -94,7 +94,7 @@
#else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
@@ -115,7 +115,7 @@
#else /* defined(ENABLE_QASYMM8_KERNELS) */
#define REGISTER_QASYMM8_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SVE(func_name) nullptr
#define REGISTER_QASYMM8_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_KERNELS) */
@@ -137,7 +137,7 @@
#else /* defined(ENABLE_QSYMM16_KERNELS) */
#define REGISTER_QSYMM16_NEON(func_name) nullptr
-#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#define REGISTER_QSYMM16_SVE(func_name) nullptr
#define REGISTER_QSYMM16_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QSYMM16_KERNELS) */
@@ -169,7 +169,7 @@
#else /* defined(ENABLE_INTEGER_KERNELS) */
#define REGISTER_INTEGER_NEON(func_name) nullptr
-#define REGISTER_INTEGER_SVE(func_name) nullptr
+#define REGISTER_INTEGER_SVE(func_name) nullptr
#define REGISTER_INTEGER_SVE2(func_name) nullptr
#endif /* defined(ENABLE_INTEGER_KERNELS) */
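
Only the alignment of the nullptr stubs changes here. The registrar pattern itself lets dispatch tables name every kernel variant unconditionally: with the matching ENABLE_*_KERNELS flag set the macro yields a pointer to the function, otherwise nullptr, and the runtime selector skips null entries. A minimal sketch with hypothetical kernel names and an assumed &(func_name) expansion (not part of this patch):

    void neon_fp32_add(const void *); // hypothetical micro-kernels
    void sve_fp32_add(const void *);

    struct KernelEntry
    {
        const char *name;
        void (*ukernel)(const void *);
    };

    // The table shape stays the same whether or not the backends are built;
    // disabled entries simply hold nullptr.
    static const KernelEntry fp32_add_kernels[] = {
        {"neon_fp32_add", REGISTER_FP32_NEON(neon_fp32_add)},
        {"sve_fp32_add", REGISTER_FP32_SVE(sve_fp32_add)},
    };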
diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h
index 8715dcd74b..9df2a76983 100644
--- a/src/core/helpers/AutoConfiguration.h
+++ b/src/core/helpers/AutoConfiguration.h
@@ -24,9 +24,9 @@
#ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H
#define SRC_CORE_HELPERS_AUTOCONFIGURATION_H
-#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
namespace arm_compute
{
@@ -42,10 +42,11 @@ namespace arm_compute
*/
inline bool auto_init_if_empty(ITensorInfo &info,
const TensorShape &shape,
- int num_channels, DataType data_type,
- QuantizationInfo quantization_info = QuantizationInfo())
+ int num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info = QuantizationInfo())
{
- if(info.tensor_shape().total_size() == 0)
+ if (info.tensor_shape().total_size() == 0)
{
info.set_data_type(data_type);
info.set_num_channels(num_channels);
@@ -70,7 +71,7 @@ inline bool auto_init_if_empty(ITensorInfo &info,
*/
inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
{
- if(info_sink.tensor_shape().total_size() == 0)
+ if (info_sink.tensor_shape().total_size() == 0)
{
info_sink.set_data_type(info_source.data_type());
info_sink.set_num_channels(info_source.num_channels());
@@ -93,7 +94,7 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s
*/
inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
{
- if(info.tensor_shape().total_size() == 0)
+ if (info.tensor_shape().total_size() == 0)
{
info.set_tensor_shape(shape);
return true;
@@ -112,7 +113,7 @@ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
*/
inline bool set_format_if_unknown(ITensorInfo &info, Format format)
{
- if(info.data_type() == DataType::UNKNOWN)
+ if (info.data_type() == DataType::UNKNOWN)
{
info.set_format(format);
return true;
@@ -131,7 +132,7 @@ inline bool set_format_if_unknown(ITensorInfo &info, Format format)
*/
inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
{
- if(info.data_type() == DataType::UNKNOWN)
+ if (info.data_type() == DataType::UNKNOWN)
{
info.set_data_type(data_type);
return true;
@@ -150,7 +151,7 @@ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
*/
inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
{
- if(info.data_layout() == DataLayout::UNKNOWN)
+ if (info.data_layout() == DataLayout::UNKNOWN)
{
info.set_data_layout(data_layout);
return true;
@@ -169,7 +170,7 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout
*/
inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
{
- if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
+ if (info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
{
info.set_quantization_info(quantization_info);
return true;
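As a usage note for the hunk above: auto_init_if_empty only touches a tensor info whose shape is still empty, which is what lets an operator configure its output lazily while respecting a caller-provided one. A minimal sketch, assuming a build inside the library's source tree (AutoConfiguration.h is an internal header):

#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/AutoConfiguration.h"

using namespace arm_compute;

int main()
{
    TensorInfo        dst{}; // empty: total_size() == 0, so eligible for auto-init
    const TensorShape shape(16U, 8U);

    // Initialises shape/channels/type because dst was empty; returns true.
    const bool initialised = auto_init_if_empty(dst, shape, 1, DataType::F32);

    // A second call is a no-op: dst is no longer empty; returns false.
    const bool touched_again = auto_init_if_empty(dst, shape, 1, DataType::F16);

    return (initialised && !touched_again) ? 0 : 1;
}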
diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h
index a41052687b..dd094b414c 100644
--- a/src/core/helpers/MemoryHelpers.h
+++ b/src/core/helpers/MemoryHelpers.h
@@ -24,9 +24,9 @@
#ifndef SRC_COMMON_MEMORY_HELPERS_H
#define SRC_COMMON_MEMORY_HELPERS_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include <memory>
@@ -43,18 +43,17 @@ inline int offset_int_vec(int offset)
template <typename TensorType>
struct WorkspaceDataElement
{
- int slot{ -1 };
- experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary };
- std::unique_ptr<TensorType> tensor{ nullptr };
+ int slot{-1};
+ experimental::MemoryLifetime lifetime{experimental::MemoryLifetime::Temporary};
+ std::unique_ptr<TensorType> tensor{nullptr};
};
template <typename TensorType>
using WorkspaceData = std::vector<WorkspaceDataElement<TensorType>>;
template <typename TensorType>
-WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
- MemoryGroup &mgroup,
- ITensorPack &run_pack)
+WorkspaceData<TensorType>
+manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack)
{
ITensorPack dummy_pack = ITensorPack();
return manage_workspace<TensorType>(mem_reqs, mgroup, run_pack, dummy_pack);
@@ -63,24 +62,26 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
template <typename TensorType>
WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
MemoryGroup &mgroup,
- ITensorPack &run_pack, ITensorPack &prep_pack)
+ ITensorPack &run_pack,
+ ITensorPack &prep_pack)
{
WorkspaceData<TensorType> workspace_memory;
- for(const auto &req : mem_reqs)
+ for (const auto &req : mem_reqs)
{
- if(req.size == 0)
+ if (req.size == 0)
{
continue;
}
- const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 };
- workspace_memory.emplace_back(WorkspaceDataElement<TensorType> { req.slot, req.lifetime, std::make_unique<TensorType>() });
+ const auto aux_info = TensorInfo{TensorShape(req.size), 1, DataType::U8};
+ workspace_memory.emplace_back(
+ WorkspaceDataElement<TensorType>{req.slot, req.lifetime, std::make_unique<TensorType>()});
auto aux_tensor = workspace_memory.back().tensor.get();
ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor);
aux_tensor->allocator()->init(aux_info, req.alignment);
- if(req.lifetime == experimental::MemoryLifetime::Temporary)
+ if (req.lifetime == experimental::MemoryLifetime::Temporary)
{
mgroup.manage(aux_tensor);
}
@@ -91,7 +92,7 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
run_pack.add_tensor(req.slot, aux_tensor);
}
- for(auto &mem : workspace_memory)
+ for (auto &mem : workspace_memory)
{
auto tensor = mem.tensor.get();
tensor->allocator()->allocate();
@@ -103,31 +104,29 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
template <typename TensorType>
void release_prepare_tensors(WorkspaceData<TensorType> &workspace, ITensorPack &prep_pack)
{
- workspace.erase(std::remove_if(workspace.begin(),
- workspace.end(),
- [&prep_pack](auto & wk)
- {
- const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
- if(to_erase)
- {
- prep_pack.remove_tensor(wk.slot);
- }
- return to_erase;
- }),
- workspace.end());
+ workspace.erase(std::remove_if(workspace.begin(), workspace.end(),
+ [&prep_pack](auto &wk)
+ {
+ const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
+ if (to_erase)
+ {
+ prep_pack.remove_tensor(wk.slot);
+ }
+ return to_erase;
+ }),
+ workspace.end());
}
/** Utility function to release tensors with lifetime marked as Prepare */
template <typename TensorType>
-void release_temporaries(const experimental::MemoryRequirements &mem_reqs,
- WorkspaceData<TensorType> &workspace)
+void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData<TensorType> &workspace)
{
- for(auto &ws : workspace)
+ for (auto &ws : workspace)
{
const int slot = ws.slot;
- for(auto &m : mem_reqs)
+ for (auto &m : mem_reqs)
{
- if(m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare)
+ if (m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare)
{
auto tensor = ws.tensor.get();
tensor->allocator()->free();
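The workspace helpers above all revolve around one bookkeeping scheme: each auxiliary buffer is described by a (slot, lifetime, size) requirement, zero-sized requirements are skipped, Temporary buffers are handed to the memory group for reuse, and Prepare buffers are released once preparation is done. A standalone sketch of that scheme with simplified stand-in types (not the library's MemoryGroup or ITensorPack):

#include <cstddef>
#include <iostream>
#include <vector>

enum class Lifetime
{
    Temporary, // managed by the memory group, reused across runs
    Prepare,   // released by release_prepare_tensors() after prepare()
    Persistent // kept alive for the whole function lifetime
};

struct Requirement
{
    int         slot;
    Lifetime    lifetime;
    std::size_t size;
};

int main()
{
    const std::vector<Requirement> reqs = {
        {0, Lifetime::Temporary, 1024},
        {1, Lifetime::Prepare, 256},
        {2, Lifetime::Persistent, 0}, // size 0: skipped, as in manage_workspace
    };

    for (const auto &req : reqs)
    {
        if (req.size == 0)
        {
            continue; // zero-sized requirements allocate nothing
        }
        std::cout << "slot " << req.slot << ": " << req.size << " bytes, "
                  << (req.lifetime == Lifetime::Prepare ? "freed after prepare()" : "kept for run()") << '\n';
    }
}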
diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h
index 079629ee6a..9ef045f472 100644
--- a/src/core/helpers/PoolingHelpers.h
+++ b/src/core/helpers/PoolingHelpers.h
@@ -33,8 +33,20 @@ namespace cpu
namespace
{
-inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
- const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
+inline float calculate_avg_scale_pool3d(bool exclude_padding,
+ const Coordinates &id,
+ const int pool_size_x,
+ const int pool_size_y,
+ const int pool_size_z,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int upper_bound_d,
+ const int pad_x,
+ const int pad_y,
+ const int pad_z,
+ const int stride_x,
+ const int stride_y,
+ const int stride_z)
{
// Based on NDHWC
int start_x = id[1] * stride_x - pad_x;
@@ -44,7 +56,7 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates
const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
- if(exclude_padding)
+ if (exclude_padding)
{
start_x = std::max(0, start_x);
start_y = std::max(0, start_y);
@@ -53,8 +65,17 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates
return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
}
-inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+inline float calculate_avg_scale_pool2d(bool exclude_padding,
+ DataLayout data_layout,
+ const Coordinates &id,
+ const int pool_size_x,
+ const int pool_size_y,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int pad_x,
+ const int pad_y,
+ const int stride_x,
+ const int stride_y)
{
const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -64,7 +85,7 @@ inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_la
const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
- if(exclude_padding)
+ if (exclude_padding)
{
start_x = std::max(0, start_x);
start_y = std::max(0, start_y);
@@ -117,17 +138,26 @@ inline float32x4_t vcvtq_f32_q32(int32x4_t values)
}
template <typename Tout>
-inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
+inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset);
template <>
-inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset)
{
const float new_scale = quant_rescale / scale_pooling;
return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
}
template <>
-inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset)
{
const float new_scale = quant_rescale / scale_pooling;
return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
@@ -139,30 +169,24 @@ inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInf
template <>
inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
- }
- };
+ const float32x4x4_t acc = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
+ }};
return vquantize(acc, requant_qinfo);
}
template <>
inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
- }
- };
+ const float32x4x4_t acc = {{
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
+ }};
return vquantize_signed(acc, requant_qinfo);
}
@@ -172,26 +196,20 @@ inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinf
template <>
inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
- }
- };
+ const float32x4x2_t acc = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
+ }};
return vquantize(acc, requant_qinfo);
}
template <>
inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
- }
- };
+ const float32x4x2_t acc = {{
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
+ }};
return vquantize_signed(acc, requant_qinfo);
}
@@ -199,4 +217,3 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo
} // namespace cpu
} // namespace arm_compute
#endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */
-
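The 2D variant above computes nothing more than the reciprocal of the number of elements actually averaged: with exclude_padding the pooling window is first clamped to the valid region, so padded positions do not dilute the mean. A standalone sketch of the same arithmetic for one output element (parameter meanings are assumed from the code above; this is not a drop-in for the library kernel):

#include <algorithm>
#include <cstdio>

float avg_scale_2d(bool exclude_padding,
                   int  out_x,
                   int  out_y,
                   int  pool_x,
                   int  pool_y,
                   int  bound_w,
                   int  bound_h,
                   int  pad_x,
                   int  pad_y,
                   int  stride_x,
                   int  stride_y)
{
    int       start_x = out_x * stride_x - pad_x;
    int       start_y = out_y * stride_y - pad_y;
    const int end_x   = std::min(start_x + pool_x, bound_w);
    const int end_y   = std::min(start_y + pool_y, bound_h);
    if (exclude_padding)
    {
        start_x = std::max(0, start_x); // clamp the window to the valid region
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

int main()
{
    // Top-left output of a 3x3, stride-1 pool with 1-pixel padding on an 8x8
    // input: only a 2x2 patch is valid, so the scale is 1/4 rather than 1/9.
    std::printf("%f\n", avg_scale_2d(true, 0, 0, 3, 3, 8, 8, 1, 1, 1, 1));
}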
diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h
index e769bba782..47605e7385 100644
--- a/src/core/helpers/ScaleHelpers.h
+++ b/src/core/helpers/ScaleHelpers.h
@@ -50,8 +50,12 @@ namespace scale_helpers
*
* @return The bilinearly interpolated pixel value
*/
-inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy,
- UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr,
+ size_t stride,
+ float dx,
+ float dy,
+ UniformQuantizationInfo iq_info,
+ UniformQuantizationInfo oq_info)
{
ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
@@ -85,8 +89,12 @@ inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stri
*
* @return The bilinearly interpolated pixel value
*/
-inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy,
- UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr,
+ size_t stride,
+ float dx,
+ float dy,
+ UniformQuantizationInfo iq_info,
+ UniformQuantizationInfo oq_info)
{
ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
@@ -122,9 +130,8 @@ inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride
*
* @return The pixel at (x, y) using area interpolation.
*/
-inline uint8_t
-pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr,
- float hr, int x, int y)
+inline uint8_t pixel_area_c1u8_clamp(
+ const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
{
ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
@@ -159,7 +166,7 @@ pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t widt
// Sum pixels in area
int sum = 0;
- for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+ for (int j = yi + y_from, je = yi + y_to; j <= je; ++j)
{
const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
sum = std::accumulate(ptr, ptr + x_elements, sum);
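delta_bilinear_c1_quantized above dequantizes the four neighbouring pixels, blends them with the fractional offsets (dx, dy) and requantizes the result; the blend itself is the classic bilinear formula. A standalone sketch of the float-domain blend for a single channel, without the quantization step:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Bilinear blend of the 2x2 neighbourhood under pixel_ptr, with fractional
// offsets dx, dy in [0, 1). stride is the row pitch in elements.
float delta_bilinear_c1(const uint8_t *pixel_ptr, std::size_t stride, float dx, float dy)
{
    const float a00 = pixel_ptr[0];
    const float a01 = pixel_ptr[1];
    const float a10 = pixel_ptr[stride];
    const float a11 = pixel_ptr[stride + 1];

    const float w1 = (1.f - dx) * (1.f - dy);
    const float w2 = dx * (1.f - dy);
    const float w3 = (1.f - dx) * dy;
    const float w4 = dx * dy;

    return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
}

int main()
{
    const uint8_t img[2 * 2] = {0, 100, 100, 200};
    std::printf("%f\n", delta_bilinear_c1(img, 2, 0.5f, 0.5f)); // midpoint: 100
}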
diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp
index 71b971af31..8184991ab5 100644
--- a/src/core/helpers/SoftmaxHelpers.cpp
+++ b/src/core/helpers/SoftmaxHelpers.cpp
@@ -29,7 +29,7 @@ namespace softmax_helpers
{
PermutationVector get_permutation_vector_from_softmax_axis(size_t axis)
{
- switch(axis)
+ switch (axis)
{
case 1:
return PermutationVector(1U, 0U, 2U, 3U);
diff --git a/src/core/helpers/Utils.cpp b/src/core/helpers/Utils.cpp
index 3900475355..6ca29d180d 100644
--- a/src/core/helpers/Utils.cpp
+++ b/src/core/helpers/Utils.cpp
@@ -31,9 +31,9 @@ bool has_holes(const ITensorInfo &info, size_t dimension)
const auto &strides = info.strides_in_bytes();
size_t squashed_bytes = info.element_size();
- for(size_t dim = 0; dim <= dimension; ++dim)
+ for (size_t dim = 0; dim <= dimension; ++dim)
{
- if(strides[dim] != squashed_bytes)
+ if (strides[dim] != squashed_bytes)
{
return true;
}
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 7ad960bfa2..2e7224c55b 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -45,7 +45,7 @@ inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixe
// Create strides object
Strides strides(stride_x, fixed_strides...);
- for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+ for (size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
{
strides.set(i, shape[i - 1] * strides[i - 1]);
}
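compute_strides above fills the remaining strides as a running product: each dimension's stride is the previous dimension's extent times the previous stride, which is also the dense layout that has_holes checks for. A standalone sketch of that recurrence for byte strides:

#include <cstddef>
#include <iostream>
#include <vector>

// Dense (hole-free) byte strides: stride[0] = element size,
// stride[i] = shape[i - 1] * stride[i - 1], as in compute_strides above.
std::vector<std::size_t> dense_strides(const std::vector<std::size_t> &shape, std::size_t element_size)
{
    std::vector<std::size_t> strides(shape.size(), element_size);
    for (std::size_t i = 1; i < shape.size(); ++i)
    {
        strides[i] = shape[i - 1] * strides[i - 1];
    }
    return strides;
}

int main()
{
    // A 4x8x2 F32 tensor: strides are 4, 16 and 128 bytes.
    for (auto s : dense_strides({4, 8, 2}, 4))
    {
        std::cout << s << ' ';
    }
    std::cout << '\n';
}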
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
index a4d46db352..30a55fcbc6 100644
--- a/src/core/helpers/WindowHelpers.cpp
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -25,9 +25,10 @@
namespace arm_compute
{
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window
+calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
{
- if(!skip_border)
+ if (!skip_border)
{
border_size = BorderSize(0);
}
@@ -38,40 +39,47 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps,
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- anchor[0] + border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
- window.set(1, Window::Dimension(
+ window.set(1,
+ Window::Dimension(
// Skip the border above the image
anchor[1] + border_size.top,
// Skip the border below the image
- anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
+ anchor[1] + border_size.top +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) -
+ static_cast<int>(border_size.bottom)),
+ steps[1]),
steps[1]));
++n;
}
- if(anchor.num_dimensions() > 2)
+ if (anchor.num_dimensions() > 2)
{
window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2]));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -81,7 +89,7 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps,
Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool skip_border, BorderSize border_size)
{
- if(!skip_border)
+ if (!skip_border)
{
border_size = BorderSize(0);
}
@@ -89,40 +97,46 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool s
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(shape.num_dimensions() > 1)
+ if (shape.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Skip the border above the image
- border_size.top,
- // Skip the border below the image
- border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
- steps[1]));
+ // Skip the border above the image
+ border_size.top,
+ // Skip the border below the image
+ border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) -
+ static_cast<int>(border_size.top) -
+ static_cast<int>(border_size.bottom)),
+ steps[1]),
+ steps[1]));
++n;
}
- if(shape.num_dimensions() > 2)
+ if (shape.num_dimensions() > 2)
{
window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[2]), steps[2]));
++n;
}
- for(; n < shape.num_dimensions(); ++n)
+ for (; n < shape.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(0, std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -138,40 +152,42 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
Window window;
window.set(0, Window::Dimension(
- // move the anchor to the start from the border
- anchor[0] - border_size.left,
- // move the anchor to include the right end border
- // Make sure the window width is a multiple of the step size
- anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
- steps[0]));
+ // move the anchor to the start from the border
+ anchor[0] - border_size.left,
+ // move the anchor to include the right end border
+ // Make sure the window width is a multiple of the step size
+ anchor[0] - border_size.left +
+ ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Include the border above the image
- anchor[1] - border_size.top,
- // Include the border below the image
- anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
- steps[1]));
+ // Include the border above the image
+ anchor[1] - border_size.top,
+ // Include the border below the image
+ anchor[1] - border_size.top +
+ ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+ steps[1]));
++n;
}
- if(anchor.num_dimensions() > 2)
+ if (anchor.num_dimensions() > 2)
{
window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -179,9 +195,12 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
return window;
}
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+ const Steps &steps,
+ bool skip_border,
+ BorderSize border_size)
{
- if(skip_border)
+ if (skip_border)
{
border_size.top = 0;
border_size.bottom = 0;
@@ -198,33 +217,35 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- anchor[0] + border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Skip the border above the image
- anchor[1] - border_size.top,
- // Skip the border below the image
- anchor[1] + shape[1] + border_size.bottom,
- 1));
+ // Skip the border above the image
+ anchor[1] - border_size.top,
+ // Skip the border below the image
+ anchor[1] + shape[1] + border_size.bottom, 1));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -247,9 +268,9 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
size_t squashed_bytes = src0.element_size();
// Try to squash the low dimensions together.
- for(; dim < num_dimensions; ++dim)
+ for (; dim < num_dimensions; ++dim)
{
- if(shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
+ if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
{
break;
}
@@ -257,7 +278,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
squashed_bytes *= shape0[dim];
}
- if(dim == num_dimensions)
+ if (dim == num_dimensions)
{
auto squashed_elements = squashed_bytes / src0.element_size();
@@ -266,7 +287,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
// The input tensors can be interpreted as 1D array.
win.set(0, Window::Dimension(0, squashed_elements, 1));
- for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, 1, 1));
}
@@ -274,7 +295,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
else
{
// Generates the max window.
- for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, std::max(shape0[dim], shape1[dim]), 1));
}
@@ -295,21 +316,21 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
size_t squashed_bytes = src.element_size();
// Try to squash the low dimensions together.
- for(; dim < num_dimensions; ++dim)
+ for (; dim < num_dimensions; ++dim)
{
- if(strides[dim] != squashed_bytes)
+ if (strides[dim] != squashed_bytes)
{
break;
}
squashed_bytes *= shape[dim];
}
- if(dim == num_dimensions)
+ if (dim == num_dimensions)
{
const auto squashed_elements = squashed_bytes / src.element_size();
split_dimension = Window::DimX;
// The input tensor can be interpreted as 1D array.
win.set(0, Window::Dimension(0, squashed_elements, 1));
- for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, 1, 1));
}
@@ -317,7 +338,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
else
{
// Generate the max window.
- for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, shape[dim], 1));
}
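The window code above keeps repeating one idiom: the extent of a dimension is the shape minus the borders, rounded up to a multiple of the step so that the last, partial iteration is still issued. A standalone sketch of that end-coordinate computation:

#include <algorithm>
#include <cstdio>

int ceil_to_multiple(int value, int step)
{
    return ((value + step - 1) / step) * step; // round up; assumes value >= 0, step > 0
}

// End coordinate of window dimension 0, as computed in calculate_max_window.
int window_end(int anchor, int shape, int border_lo, int border_hi, int step)
{
    const int interior = std::max(0, shape - border_lo - border_hi);
    return anchor + border_lo + ceil_to_multiple(interior, step);
}

int main()
{
    // 17 interior elements processed 4 at a time are rounded up to 20,
    // so the window spans [1, 21) in steps of 4.
    std::printf("end = %d\n", window_end(0, 19, 1, 1, 4));
}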
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
index eccf7f2d18..e404c18e8a 100644
--- a/src/core/helpers/WindowHelpers.h
+++ b/src/core/helpers/WindowHelpers.h
@@ -43,21 +43,13 @@ namespace arm_compute
* influence the returned value.
*/
template <typename... Ts>
-bool update_window_and_padding(Window &win, Ts &&... patterns)
+bool update_window_and_padding(Window &win, Ts &&...patterns)
{
bool window_changed = false;
- utility::for_each([&](const IAccessWindow & w)
- {
- window_changed |= w.update_window_if_needed(win);
- },
- patterns...);
+ utility::for_each([&](const IAccessWindow &w) { window_changed |= w.update_window_if_needed(win); }, patterns...);
- utility::for_each([&](IAccessWindow & w)
- {
- w.update_padding_if_needed(win);
- },
- patterns...);
+ utility::for_each([&](IAccessWindow &w) { w.update_padding_if_needed(win); }, patterns...);
return window_changed;
}
@@ -69,18 +61,18 @@ bool update_window_and_padding(Window &win, Ts &&... patterns)
* @return Intersection of all regions.
*/
template <typename... Ts>
-ValidRegion intersect_valid_regions(const Ts &... regions)
+ValidRegion intersect_valid_regions(const Ts &...regions)
{
- auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
+ auto intersect = [](const ValidRegion &r1, const ValidRegion &r2) -> ValidRegion
{
ValidRegion region;
- for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
+ for (size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
{
region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
}
- for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
+ for (size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
{
region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
}
@@ -101,7 +93,10 @@ ValidRegion intersect_valid_regions(const Ts &... regions)
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting
*
@@ -112,7 +107,10 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const TensorShape &shape,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting
*
@@ -123,7 +121,10 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize())
{
return calculate_max_window(info.tensor_shape(), steps, skip_border, border_size);
}
@@ -137,7 +138,10 @@ inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps =
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
*
@@ -148,7 +152,10 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window_horizontal(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize())
{
return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
}
@@ -161,7 +168,9 @@ inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Ste
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+Window calculate_max_enlarged_window(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
*
@@ -171,7 +180,9 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
+inline Window calculate_max_enlarged_window(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ BorderSize border_size = BorderSize())
{
return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
}
@@ -208,7 +219,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
* @return A pair of the shape and window
*/
template <typename... Shapes>
-std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &... shapes)
+std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &...shapes)
{
const TensorShape out_shape = TensorShape::broadcast_shape(shapes...);
return std::make_pair(out_shape, calculate_max_window(out_shape));
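update_window_and_padding above leans on a small variadic idiom: a lambda applied to every element of a parameter pack while folding a flag across the calls. A standalone sketch of the pattern with a generic for_each (a stand-in for, not the implementation of, the library's utility::for_each):

#include <iostream>
#include <utility>

// Apply f to each argument in turn; a C++17 fold expression does the iteration.
template <typename F, typename... Ts>
void for_each_arg(F &&f, Ts &&...args)
{
    (f(std::forward<Ts>(args)), ...);
}

int main()
{
    bool any_changed = false;
    auto update      = [&](int v)
    {
        // Accumulate a flag across the pack, like
        // window_changed |= w.update_window_if_needed(win) above.
        any_changed |= (v % 2 == 0);
    };
    for_each_arg(update, 1, 3, 4, 7);
    std::cout << std::boolalpha << any_changed << '\n'; // true
}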
diff --git a/src/core/utils/ActivationFunctionUtils.cpp b/src/core/utils/ActivationFunctionUtils.cpp
index 4854b8eb0b..017170a0c5 100644
--- a/src/core/utils/ActivationFunctionUtils.cpp
+++ b/src/core/utils/ActivationFunctionUtils.cpp
@@ -28,26 +28,24 @@
namespace arm_compute
{
-const std::string &string_from_activation_func(const ActivationFunction& act)
+const std::string &string_from_activation_func(const ActivationFunction &act)
{
- static std::map<ActivationFunction, const std::string> act_map =
- {
- { ActivationFunction::ABS, "ABS" },
- { ActivationFunction::LINEAR, "LINEAR" },
- { ActivationFunction::LOGISTIC, "LOGISTIC" },
- { ActivationFunction::RELU, "RELU" },
- { ActivationFunction::BOUNDED_RELU, "BRELU" },
- { ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" },
- { ActivationFunction::LEAKY_RELU, "LRELU" },
- { ActivationFunction::SOFT_RELU, "SRELU" },
- { ActivationFunction::ELU, "ELU" },
- { ActivationFunction::SQRT, "SQRT" },
- { ActivationFunction::SQUARE, "SQUARE" },
- { ActivationFunction::TANH, "TANH" },
- { ActivationFunction::IDENTITY, "IDENTITY" },
- { ActivationFunction::HARD_SWISH, "HARD_SWISH" },
- { ActivationFunction::SWISH, "SWISH" },
- { ActivationFunction::GELU, "GELU" }
+ static std::map<ActivationFunction, const std::string> act_map = {{ActivationFunction::ABS, "ABS"},
+ {ActivationFunction::LINEAR, "LINEAR"},
+ {ActivationFunction::LOGISTIC, "LOGISTIC"},
+ {ActivationFunction::RELU, "RELU"},
+ {ActivationFunction::BOUNDED_RELU, "BRELU"},
+ {ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU"},
+ {ActivationFunction::LEAKY_RELU, "LRELU"},
+ {ActivationFunction::SOFT_RELU, "SRELU"},
+ {ActivationFunction::ELU, "ELU"},
+ {ActivationFunction::SQRT, "SQRT"},
+ {ActivationFunction::SQUARE, "SQUARE"},
+ {ActivationFunction::TANH, "TANH"},
+ {ActivationFunction::IDENTITY, "IDENTITY"},
+ {ActivationFunction::HARD_SWISH, "HARD_SWISH"},
+ {ActivationFunction::SWISH, "SWISH"},
+ {ActivationFunction::GELU, "GELU"}
};
diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp
index 6d483adc7f..d97ea42091 100644
--- a/src/core/utils/AssemblyUtils.cpp
+++ b/src/core/utils/AssemblyUtils.cpp
@@ -34,12 +34,12 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
arm_gemm::Activation gemm_act;
// Early exit in case lower bound is other than 0, as it's not yet supported
- if(act.b() != 0.f)
+ if (act.b() != 0.f)
{
return gemm_act;
}
- switch(act.activation())
+ switch (act.activation())
{
case ActivationLayerInfo::ActivationFunction::RELU:
gemm_act.type = arm_gemm::Activation::Type::ReLU;
@@ -63,17 +63,15 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
{
- return arm_conv::PaddingValues{ pad_stride_info.pad_left(),
- pad_stride_info.pad_top(),
- pad_stride_info.pad_right(),
- pad_stride_info.pad_bottom() };
+ return arm_conv::PaddingValues{pad_stride_info.pad_left(), pad_stride_info.pad_top(), pad_stride_info.pad_right(),
+ pad_stride_info.pad_bottom()};
}
arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format)
{
arm_gemm::WeightFormat gemm_weight_fromat;
- switch(weight_format)
+ switch (weight_format)
{
case arm_compute::WeightFormat::UNSPECIFIED:
gemm_weight_fromat = arm_gemm::WeightFormat::UNSPECIFIED;
@@ -193,7 +191,7 @@ arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::Weigh
{
arm_compute::WeightFormat acl_weight_fromat;
- switch(weight_format)
+ switch (weight_format)
{
case arm_gemm::WeightFormat::UNSPECIFIED:
acl_weight_fromat = arm_compute::WeightFormat::UNSPECIFIED;
diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h
index 60bad3b618..7d0d37c4ef 100644
--- a/src/core/utils/AssemblyUtils.h
+++ b/src/core/utils/AssemblyUtils.h
@@ -25,6 +25,7 @@
#define UTILS_CORE_ASSEMBLY_UTILS_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/kernels/assembly/common.hpp"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
@@ -65,6 +66,6 @@ arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFo
* @return Compute Library WeightFormat
*/
arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format);
-} // namespace assembly
+} // namespace assembly_utils
} // namespace arm_compute
#endif /* UTILS_CORE_ASSEMBLY_UTILS_H */
diff --git a/src/core/utils/DataLayoutUtils.cpp b/src/core/utils/DataLayoutUtils.cpp
index 4919b79a42..234bed71cb 100644
--- a/src/core/utils/DataLayoutUtils.cpp
+++ b/src/core/utils/DataLayoutUtils.cpp
@@ -29,11 +29,10 @@ namespace arm_compute
const std::string &string_from_data_layout(DataLayout dl)
{
- static std::map<DataLayout, const std::string> dl_map =
- {
- { DataLayout::UNKNOWN, "UNKNOWN" },
- { DataLayout::NCHW, "NCHW" },
- { DataLayout::NHWC, "NHWC" },
+ static std::map<DataLayout, const std::string> dl_map = {
+ {DataLayout::UNKNOWN, "UNKNOWN"},
+ {DataLayout::NCHW, "NCHW"},
+ {DataLayout::NHWC, "NHWC"},
};
return dl_map[dl];
diff --git a/src/core/utils/DataTypeUtils.cpp b/src/core/utils/DataTypeUtils.cpp
index 07999354d9..1394339987 100644
--- a/src/core/utils/DataTypeUtils.cpp
+++ b/src/core/utils/DataTypeUtils.cpp
@@ -30,27 +30,26 @@ namespace arm_compute
{
const std::string &string_from_data_type(DataType dt)
{
- static std::map<DataType, const std::string> dt_map =
- {
- { DataType::UNKNOWN, "UNKNOWN" },
- { DataType::S8, "S8" },
- { DataType::U8, "U8" },
- { DataType::S16, "S16" },
- { DataType::U16, "U16" },
- { DataType::S32, "S32" },
- { DataType::U32, "U32" },
- { DataType::S64, "S64" },
- { DataType::U64, "U64" },
- { DataType::F16, "F16" },
- { DataType::F32, "F32" },
- { DataType::F64, "F64" },
- { DataType::SIZET, "SIZET" },
- { DataType::QSYMM8, "QSYMM8" },
- { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
- { DataType::QASYMM8, "QASYMM8" },
- { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" },
- { DataType::QSYMM16, "QSYMM16" },
- { DataType::QASYMM16, "QASYMM16" },
+ static std::map<DataType, const std::string> dt_map = {
+ {DataType::UNKNOWN, "UNKNOWN"},
+ {DataType::S8, "S8"},
+ {DataType::U8, "U8"},
+ {DataType::S16, "S16"},
+ {DataType::U16, "U16"},
+ {DataType::S32, "S32"},
+ {DataType::U32, "U32"},
+ {DataType::S64, "S64"},
+ {DataType::U64, "U64"},
+ {DataType::F16, "F16"},
+ {DataType::F32, "F32"},
+ {DataType::F64, "F64"},
+ {DataType::SIZET, "SIZET"},
+ {DataType::QSYMM8, "QSYMM8"},
+ {DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL"},
+ {DataType::QASYMM8, "QASYMM8"},
+ {DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED"},
+ {DataType::QSYMM16, "QSYMM16"},
+ {DataType::QASYMM16, "QASYMM16"},
};
return dt_map[dt];
@@ -58,12 +57,11 @@ const std::string &string_from_data_type(DataType dt)
DataType data_type_from_name(const std::string &name)
{
- static const std::map<std::string, DataType> data_types =
- {
- { "f16", DataType::F16 },
- { "f32", DataType::F32 },
- { "qasymm8", DataType::QASYMM8 },
- { "qasymm8_signed", DataType::QASYMM8_SIGNED },
+ static const std::map<std::string, DataType> data_types = {
+ {"f16", DataType::F16},
+ {"f32", DataType::F32},
+ {"qasymm8", DataType::QASYMM8},
+ {"qasymm8_signed", DataType::QASYMM8_SIGNED},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -74,7 +72,7 @@ DataType data_type_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str());
}
diff --git a/src/core/utils/FormatUtils.cpp b/src/core/utils/FormatUtils.cpp
index 05b649ded2..46f8455315 100644
--- a/src/core/utils/FormatUtils.cpp
+++ b/src/core/utils/FormatUtils.cpp
@@ -30,26 +30,16 @@ namespace arm_compute
{
const std::string &string_from_format(Format format)
{
- static std::map<Format, const std::string> formats_map =
- {
- { Format::UNKNOWN, "UNKNOWN" },
- { Format::U8, "U8" },
- { Format::S16, "S16" },
- { Format::U16, "U16" },
- { Format::S32, "S32" },
- { Format::U32, "U32" },
- { Format::F16, "F16" },
- { Format::F32, "F32" },
- { Format::UV88, "UV88" },
- { Format::RGB888, "RGB888" },
- { Format::RGBA8888, "RGBA8888" },
- { Format::YUV444, "YUV444" },
- { Format::YUYV422, "YUYV422" },
- { Format::NV12, "NV12" },
- { Format::NV21, "NV21" },
- { Format::IYUV, "IYUV" },
- { Format::UYVY422, "UYVY422" }
- };
+ static std::map<Format, const std::string> formats_map = {
+ {Format::UNKNOWN, "UNKNOWN"}, {Format::U8, "U8"},
+ {Format::S16, "S16"}, {Format::U16, "U16"},
+ {Format::S32, "S32"}, {Format::U32, "U32"},
+ {Format::F16, "F16"}, {Format::F32, "F32"},
+ {Format::UV88, "UV88"}, {Format::RGB888, "RGB888"},
+ {Format::RGBA8888, "RGBA8888"}, {Format::YUV444, "YUV444"},
+ {Format::YUYV422, "YUYV422"}, {Format::NV12, "NV12"},
+ {Format::NV21, "NV21"}, {Format::IYUV, "IYUV"},
+ {Format::UYVY422, "UYVY422"}};
return formats_map[format];
}
diff --git a/src/core/utils/InterpolationPolicyUtils.cpp b/src/core/utils/InterpolationPolicyUtils.cpp
index 2d6cabe85e..276e760544 100644
--- a/src/core/utils/InterpolationPolicyUtils.cpp
+++ b/src/core/utils/InterpolationPolicyUtils.cpp
@@ -29,11 +29,10 @@ namespace arm_compute
const std::string &string_from_interpolation_policy(InterpolationPolicy policy)
{
- static std::map<InterpolationPolicy, const std::string> interpolation_policy_map =
- {
- { InterpolationPolicy::AREA, "AREA" },
- { InterpolationPolicy::BILINEAR, "BILINEAR" },
- { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" },
+ static std::map<InterpolationPolicy, const std::string> interpolation_policy_map = {
+ {InterpolationPolicy::AREA, "AREA"},
+ {InterpolationPolicy::BILINEAR, "BILINEAR"},
+ {InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR"},
};
return interpolation_policy_map[policy];
diff --git a/src/core/utils/ScaleUtils.cpp b/src/core/utils/ScaleUtils.cpp
index ee57a8e7a7..a92da39b67 100644
--- a/src/core/utils/ScaleUtils.cpp
+++ b/src/core/utils/ScaleUtils.cpp
@@ -23,11 +23,12 @@
*/
#include "src/core/utils/ScaleUtils.h"
-#include "src/common/cpuinfo/CpuIsaInfo.h"
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/TensorInfo.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+
float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners)
{
const size_t offset = (align_corners && output_size > 1) ? 1 : 0;
@@ -40,13 +41,15 @@ float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t
return static_cast<float>(in) / static_cast<float>(out);
}
-bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, DataType data_type,
- InterpolationPolicy policy, BorderMode border_mode)
+bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout,
+ DataType data_type,
+ InterpolationPolicy policy,
+ BorderMode border_mode)
{
// Do not calculate precomputed weights and indices if kernel code doesn't use them
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
case DataType::F16:
@@ -62,4 +65,4 @@ bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout
}
return true;
-}
\ No newline at end of file
+}
diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h
index 1484824a7f..d8dddc8c70 100644
--- a/src/core/utils/ScaleUtils.h
+++ b/src/core/utils/ScaleUtils.h
@@ -60,8 +60,11 @@ inline bool is_align_corners_allowed_sampling_policy(SamplingPolicy sampling_pol
*
* @return True if precomputation is required
*/
-bool is_precomputation_required(DataLayout data_layout, DataType data_type, InterpolationPolicy policy, BorderMode border_mode);
+bool is_precomputation_required(DataLayout data_layout,
+ DataType data_type,
+ InterpolationPolicy policy,
+ BorderMode border_mode);
} // namespace scale_utils
} // namespace arm_compute
-#endif /* UTILS_CORE_SCALEUTILS_H */
\ No newline at end of file
+#endif /* UTILS_CORE_SCALEUTILS_H */
diff --git a/src/core/utils/StringUtils.cpp b/src/core/utils/StringUtils.cpp
index 6d05c9b64e..bcab0ce10c 100644
--- a/src/core/utils/StringUtils.cpp
+++ b/src/core/utils/StringUtils.cpp
@@ -55,7 +55,7 @@ std::string float_to_string_with_full_precision(float val)
ss.precision(std::numeric_limits<float>::max_digits10);
ss << val;
- if(val != static_cast<int>(val))
+ if (val != static_cast<int>(val))
{
ss << "f";
}
@@ -65,17 +65,11 @@ std::string float_to_string_with_full_precision(float val)
std::string join(const std::vector<std::string> strings, const std::string &sep)
{
- if(strings.empty())
+ if (strings.empty())
{
return "";
}
- return std::accumulate(
- std::next(strings.begin()),
- strings.end(),
- strings.at(0),
- [&sep](const std::string & a, const std::string & b)
- {
- return a + sep + b;
- });
-}
+ return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0),
+ [&sep](const std::string &a, const std::string &b) { return a + sep + b; });
}
+} // namespace arm_compute
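The reformatted join above is a compact std::accumulate fold: seed with the first element, then append separator-plus-element for every later one. A standalone, runnable equivalent:

#include <iostream>
#include <iterator>
#include <numeric>
#include <string>
#include <vector>

std::string join(const std::vector<std::string> &strings, const std::string &sep)
{
    if (strings.empty())
    {
        return "";
    }
    return std::accumulate(std::next(strings.begin()), strings.end(), strings.front(),
                           [&sep](const std::string &a, const std::string &b) { return a + sep + b; });
}

int main()
{
    std::cout << join({"f16", "f32", "qasymm8"}, ", ") << '\n'; // f16, f32, qasymm8
}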
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
index 64633c643d..edc8d0eacc 100644
--- a/src/core/utils/helpers/fft.cpp
+++ b/src/core/utils/helpers/fft.cpp
@@ -37,7 +37,7 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
unsigned int res = N;
// Early exit if no supported factors are provided
- if(supported_factors.empty())
+ if (supported_factors.empty())
{
return stages;
}
@@ -46,10 +46,10 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
auto rfactor_it = supported_factors.rbegin();
// Decomposition step
- while(res != 0)
+ while (res != 0)
{
const unsigned int factor = *rfactor_it;
- if(0 == (res % factor) && res >= factor)
+ if (0 == (res % factor) && res >= factor)
{
stages.push_back(factor);
res /= factor;
@@ -57,9 +57,9 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
else
{
++rfactor_it;
- if(rfactor_it == supported_factors.rend())
+ if (rfactor_it == supported_factors.rend())
{
- if(res > 1)
+ if (res > 1)
{
// Couldn't decompose with given factors
stages.clear();
@@ -81,8 +81,9 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto
std::vector<unsigned int> idx_digit_reverse;
// Early exit in case N and fft stages do not match
- const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
- if(stages_prod != N)
+ const float stages_prod =
+ std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
+ if (stages_prod != N)
{
return idx_digit_reverse;
}
@@ -94,13 +95,13 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto
unsigned int n_stages = fft_stages.size();
// Scan elements
- for(unsigned int n = 0; n < N; ++n)
+ for (unsigned int n = 0; n < N; ++n)
{
unsigned int k = n;
unsigned int Nx = fft_stages[0];
// Scan stages
- for(unsigned int s = 1; s < n_stages; ++s)
+ for (unsigned int s = 1; s < n_stages; ++s)
{
// radix of stage i-th
unsigned int Ny = fft_stages[s];
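decompose_stages above greedily divides N by the largest supported radix, falling back to smaller ones, and signals failure with an empty result when the remainder cannot be factored. A standalone sketch of that loop:

#include <iostream>
#include <set>
#include <vector>

std::vector<unsigned int> decompose(unsigned int N, const std::set<unsigned int> &factors)
{
    std::vector<unsigned int> stages;
    auto         it  = factors.rbegin(); // largest factor first
    unsigned int res = N;
    while (res > 1)
    {
        if (it == factors.rend())
        {
            return {}; // couldn't decompose with the given factors
        }
        if (res % *it == 0)
        {
            stages.push_back(*it);
            res /= *it;
        }
        else
        {
            ++it;
        }
    }
    return stages;
}

int main()
{
    for (auto s : decompose(120, {2, 3, 4, 5, 7, 8})) // prints: 8 5 3
    {
        std::cout << s << ' ';
    }
    std::cout << '\n';
}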
diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h
index 99e1ea54ee..7f7fbd13bf 100644
--- a/src/core/utils/helpers/float_ops.h
+++ b/src/core/utils/helpers/float_ops.h
@@ -39,8 +39,7 @@ union RawFloat
*
* @param[in] val Floating-point value
*/
- explicit RawFloat(float val)
- : f32(val)
+ explicit RawFloat(float val) : f32(val)
{
}
/** Extract sign of floating point number
diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h
index 9279532e2a..fd4745a453 100644
--- a/src/core/utils/helpers/tensor_info.h
+++ b/src/core/utils/helpers/tensor_info.h
@@ -41,15 +41,17 @@ namespace tensor_info
* @return True if tensors have mismatching quantization info else false.
*/
template <typename... Ts>
-inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->quantization_info() != first_quantization_info;
- });
+ const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{
+ {tensor_info_2, std::forward<Ts>(tensor_infos)...}};
+ return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+ [&](const ITensorInfo *tensor_info)
+ { return tensor_info->quantization_info() != first_quantization_info; });
}
} // namespace tensor_info
} // namespace helpers
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index f2216995a9..19d0badd74 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -36,10 +36,11 @@ int calculate_stride_on_index(int index, Coordinates strides)
return index >= static_cast<int>(strides.num_dimensions()) ? 1 : strides[index];
}
-int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
+int calculate_start_on_index(
+ TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
{
// Early exit
- if(index >= static_cast<int>(starts.num_dimensions()))
+ if (index >= static_cast<int>(starts.num_dimensions()))
{
return 0;
}
@@ -51,14 +52,14 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
int start = starts[index];
// Reset in case of begin mask present
- if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
+ if (arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
{
start = stride > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
}
// Account negative start points
const int dim_size = input_shape[index];
- if(start < 0)
+ if (start < 0)
{
start += dim_size;
}
@@ -69,12 +70,16 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
return start;
}
-int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index,
- Coordinates ends, Coordinates strides,
- int32_t end_mask, int32_t shrink_axis_mask)
+int calculate_end_on_index(TensorShape input_shape,
+ int index,
+ int start_on_index,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
// Early exit
- if(index >= static_cast<int>(ends.num_dimensions()))
+ if (index >= static_cast<int>(ends.num_dimensions()))
{
return input_shape[index];
}
@@ -86,9 +91,9 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
int stop = ends[index];
// Shrink dimension
- if(shrink_axis)
+ if (shrink_axis)
{
- if(start_on_index == std::numeric_limits<int>::max())
+ if (start_on_index == std::numeric_limits<int>::max())
{
stop = start_on_index;
}
@@ -99,14 +104,14 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
}
// Reset in case of begin mask present
- if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
+ if (arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
{
stop = (stride > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
}
// Account negative end points
const int dim_size = input_shape[index];
- if(stop < 0)
+ if (stop < 0)
{
stop += dim_size;
}
@@ -118,14 +123,18 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
}
std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
- Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
Coordinates starts_abs{};
Coordinates ends_abs{};
Coordinates final_strides{};
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
starts_abs.set(i, start_i);
@@ -136,13 +145,19 @@ std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords
return std::make_tuple(starts_abs, ends_abs, final_strides);
}
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked)
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape,
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask,
+ bool return_unshrinked)
{
unsigned int index = 0;
TensorShape output_shape;
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const int stride = calculate_stride_on_index(index, strides);
const int start = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
@@ -150,11 +165,11 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina
const int range = end - start;
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- if(return_unshrinked || !is_shrink)
+ if (return_unshrinked || !is_shrink)
{
- if((range == 0) || // Zero range
- (range < 0 && stride >= 0) || // Negative range with positive stride
- (range > 0 && stride <= 0)) // Positive range with negative stride
+ if ((range == 0) || // Zero range
+ (range < 0 && stride >= 0) || // Negative range with positive stride
+ (range > 0 && stride <= 0)) // Positive range with negative stride
{
output_shape.set(index, 0);
return output_shape;
@@ -173,9 +188,9 @@ int32_t construct_slice_end_mask(Coordinates ends)
{
// Create end mask
int32_t end_mask = 0;
- for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < ends.num_dimensions(); ++i)
{
- if(ends[i] < 0)
+ if (ends[i] < 0)
{
end_mask |= 1 << i;
}
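calculate_start_on_index above combines three rules: a set bit in begin_mask means "start from the lowest end" (or the highest, for negative strides), negative starts count back from the end of the dimension, and the result is clamped into range. A standalone sketch of those rules:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>

int slice_start(int dim_size, int start, int stride, int32_t begin_mask, int index)
{
    if (begin_mask & (1 << index)) // masked: take the lowest/highest extreme
    {
        start = stride > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
    }
    if (start < 0)
    {
        start += dim_size; // negative indices count back from the end
    }
    return std::max(0, std::min(start, dim_size - 1));
}

int main()
{
    std::cout << slice_start(10, -2, 1, 0, 0) << '\n'; // 8: negative index
    std::cout << slice_start(10, 5, 1, 1, 0) << '\n';  // 0: bit 0 of begin_mask set
}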
diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp
index 95fc2e3fa2..d106493238 100644
--- a/src/core/utils/io/FileHandler.cpp
+++ b/src/core/utils/io/FileHandler.cpp
@@ -21,16 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <string>
-
#include "arm_compute/core/utils/io/FileHandler.h"
#include "arm_compute/core/Error.h"
+#include <string>
+
using namespace arm_compute::io;
-FileHandler::FileHandler()
- : _filestream(), _filename(" "), _mode()
+FileHandler::FileHandler() : _filestream(), _filename(" "), _mode()
{
}
diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp
index 55e78f9630..7b4eead38d 100644
--- a/src/core/utils/logging/FilePrinter.cpp
+++ b/src/core/utils/logging/FilePrinter.cpp
@@ -25,8 +25,7 @@
using namespace arm_compute::logging;
-FilePrinter::FilePrinter(const std::string &filename)
- : _handler()
+FilePrinter::FilePrinter(const std::string &filename) : _handler()
{
_handler.open(filename, std::fstream::out | std::fstream::trunc);
}
@@ -34,4 +33,4 @@ FilePrinter::FilePrinter(const std::string &filename)
void FilePrinter::print_internal(const std::string &msg)
{
_handler.stream() << msg << std::endl;
-} \ No newline at end of file
+}
diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp
index c3df7f6207..14ad910562 100644
--- a/src/core/utils/logging/Helpers.cpp
+++ b/src/core/utils/logging/Helpers.cpp
@@ -30,13 +30,12 @@ using namespace arm_compute::logging;
const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level)
{
- static std::map<LogLevel, const std::string> log_level_map =
- {
- { LogLevel::VERBOSE, "VERBOSE" },
- { LogLevel::INFO, "INFO" },
- { LogLevel::WARN, "WARN" },
- { LogLevel::OFF, "OFF" },
+ static std::map<LogLevel, const std::string> log_level_map = {
+ {LogLevel::VERBOSE, "VERBOSE"},
+ {LogLevel::INFO, "INFO"},
+ {LogLevel::WARN, "WARN"},
+ {LogLevel::OFF, "OFF"},
};
return log_level_map[log_level];
-} \ No newline at end of file
+}
diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp
index 70b5868da8..d6681f8179 100644
--- a/src/core/utils/logging/Logger.cpp
+++ b/src/core/utils/logging/Logger.cpp
@@ -30,10 +30,7 @@
using namespace arm_compute::logging;
Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer)
- : _name(std::move(name)), _log_level(log_level), _printers(
-{
- std::move(printer)
-}), _decorators()
+ : _name(std::move(name)), _log_level(log_level), _printers({std::move(printer)}), _decorators()
{
// Check printer
ARM_COMPUTE_ERROR_ON(printer == nullptr);
@@ -46,7 +43,7 @@ Logger::Logger(std::string name, LogLevel log_level, std::vector<std::shared_ptr
: _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators()
{
// Check printers
- for(const auto &p : _printers)
+ for (const auto &p : _printers)
{
ARM_COMPUTE_UNUSED(p);
ARM_COMPUTE_ERROR_ON(p == nullptr);
@@ -62,13 +59,13 @@ Logger::Logger(std::string name,
: _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators(std::move(decorators))
{
// Check printers
- for(const auto &p : _printers)
+ for (const auto &p : _printers)
{
ARM_COMPUTE_UNUSED(p);
ARM_COMPUTE_ERROR_ON(p == nullptr);
}
// Check decorators
- for(const auto &d : _decorators)
+ for (const auto &d : _decorators)
{
ARM_COMPUTE_UNUSED(d);
ARM_COMPUTE_ERROR_ON(d == nullptr);
@@ -79,7 +76,7 @@ void Logger::log(LogLevel log_level, const std::string &msg)
{
// Return if message shouldn't be logged
// i.e. if log level does not match the logger's
- if(!is_loggable(log_level))
+ if (!is_loggable(log_level))
{
return;
}
@@ -129,7 +126,7 @@ bool Logger::is_loggable(LogLevel log_level)
void Logger::decorate_log_msg(LogMsg &msg)
{
- for(const auto &d : _decorators)
+ for (const auto &d : _decorators)
{
d->decorate(msg);
}
@@ -148,7 +145,7 @@ std::string Logger::create_log_msg(const std::string &str, LogLevel log_level)
void Logger::print_all(const std::string &msg)
{
- for(auto &p : _printers)
+ for (auto &p : _printers)
{
p->print(msg);
}
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index c281d8863c..17015d9ae9 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/core/utils/logging/LoggerRegistry.h"
#include "arm_compute/core/Error.h"
+
#include "support/Mutex.h"
using namespace arm_compute::logging;
/** Reserved loggers used by the library */
-std::set<std::string> LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" };
+std::set<std::string> LoggerRegistry::_reserved_loggers = {"CORE", "RUNTIME", "GRAPH"};
-LoggerRegistry::LoggerRegistry()
- : _mtx(), _loggers()
+LoggerRegistry::LoggerRegistry() : _mtx(), _loggers()
{
}
@@ -42,10 +42,12 @@ LoggerRegistry &LoggerRegistry::get()
return _instance;
}
-void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
+void LoggerRegistry::create_logger(const std::string &name,
+ LogLevel log_level,
+ const std::vector<std::shared_ptr<Printer>> &printers)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
+ if ((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
{
_loggers[name] = std::make_shared<Logger>(name, log_level, printers);
}
@@ -54,7 +56,7 @@ void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level,
void LoggerRegistry::remove_logger(const std::string &name)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- if(_loggers.find(name) != _loggers.end())
+ if (_loggers.find(name) != _loggers.end())
{
_loggers.erase(name);
}
@@ -69,9 +71,9 @@ std::shared_ptr<Logger> LoggerRegistry::logger(const std::string &name)
void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- for(const auto &r : _reserved_loggers)
+ for (const auto &r : _reserved_loggers)
{
- if(_loggers.find(r) == _loggers.end())
+ if (_loggers.find(r) == _loggers.end())
{
_loggers[r] = std::make_shared<Logger>(r, log_level, printers);
}
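
For context, a hedged usage sketch of the registry API whose signatures appear in this hunk; the StdPrinter class and its include path are assumptions about the library's printer headers, and log_example is illustrative only:

    #include "arm_compute/core/utils/logging/LoggerRegistry.h"
    #include "arm_compute/core/utils/logging/StdPrinter.h"

    #include <memory>

    using namespace arm_compute::logging;

    void log_example()
    {
        // Register a logger backed by a stdout printer. Names matching the
        // reserved set ("CORE", "RUNTIME", "GRAPH") are silently ignored here.
        LoggerRegistry::get().create_logger("EXAMPLE", LogLevel::INFO, {std::make_shared<StdPrinter>()});

        // Look the logger up and emit a message through all of its printers.
        auto logger = LoggerRegistry::get().logger("EXAMPLE");
        if (logger != nullptr)
        {
            logger->log(LogLevel::INFO, "hello from EXAMPLE");
        }
    }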
diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp
index adae8a2bf0..a467cb3320 100644
--- a/src/core/utils/misc/MMappedFile.cpp
+++ b/src/core/utils/misc/MMappedFile.cpp
@@ -27,12 +27,11 @@
#include <cstdio>
#include <cstring>
-#include <tuple>
-
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
+#include <tuple>
#include <unistd.h>
namespace arm_compute
@@ -53,7 +52,7 @@ std::pair<size_t, bool> get_file_size(const std::string &filename)
{
struct stat st; // NOLINT
memset(&st, 0, sizeof(struct stat));
- if(stat(filename.c_str(), &st) == 0)
+ if (stat(filename.c_str(), &st) == 0)
{
return std::make_pair(st.st_size, true);
}
@@ -73,8 +72,7 @@ size_t get_page_size()
}
} // namespace
-MMappedFile::MMappedFile()
- : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
+MMappedFile::MMappedFile() : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
{
}
@@ -92,14 +90,14 @@ MMappedFile::~MMappedFile()
bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
{
// Check if file is mapped
- if(is_mapped())
+ if (is_mapped())
{
return false;
}
// Open file
_fp = fopen(filename.c_str(), "a+be");
- if(_fp == nullptr)
+ if (_fp == nullptr)
{
return false;
}
@@ -107,26 +105,26 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
// Extract file descriptor
int fd = fileno(_fp);
bool status = fd >= 0;
- if(status)
+ if (status)
{
// Get file size
std::tie(_file_size, status) = get_file_size(_filename);
- if(status)
+ if (status)
{
// Map the whole file from the offset if the map size is 0
_map_size = (size == 0) ? _file_size : size;
_map_offset = offset;
// Check offset mapping
- if((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
+ if ((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
{
status = false;
}
else
{
// Truncate to file size
- if(_map_offset + _map_size > _file_size)
+ if (_map_offset + _map_size > _file_size)
{
_map_size = _file_size - _map_offset;
}
@@ -137,7 +135,7 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
}
}
- if(!status)
+ if (!status)
{
fclose(_fp);
}
@@ -148,14 +146,14 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
void MMappedFile::release()
{
// Unmap file
- if(_data != nullptr)
+ if (_data != nullptr)
{
::munmap(_data, _file_size);
_data = nullptr;
}
// Close file
- if(_fp != nullptr)
+ if (_fp != nullptr)
{
fclose(_fp);
_fp = nullptr;
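
The page-size check applied above exists because mmap(2) only accepts file offsets that are multiples of the system page size. A minimal POSIX sketch of the same pattern (map_region is a hypothetical helper, not part of this patch):

    #include <cstddef>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <unistd.h>

    // Map `size` bytes of `path` starting at `offset`, read-only.
    void *map_region(const char *path, std::size_t size, off_t offset)
    {
        const long page = sysconf(_SC_PAGESIZE);
        if (page <= 0 || offset % page != 0)
        {
            return nullptr; // mmap would fail with EINVAL on an unaligned offset
        }
        const int fd = open(path, O_RDONLY);
        if (fd < 0)
        {
            return nullptr;
        }
        void *data = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, offset);
        close(fd); // an established mapping survives closing the descriptor
        return (data == MAP_FAILED) ? nullptr : data;
    }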
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 086d63b968..f66d3e7064 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/utils/quantization/AsymmHelpers.h"
#include "support/ToolchainSupport.h"
@@ -40,7 +42,7 @@ constexpr float epsilon = 0.00001f;
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon)
{
- if(multiplier >= 1.f)
+ if (multiplier >= 1.f)
{
Status status = calculate_quantized_multiplier_greater_than_one(multiplier, quant_multiplier, shift);
*shift *= -1;
@@ -69,13 +71,13 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier,
*right_shift = -1 * shift_exp;
auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
- if(q_fixed == fixed_point_one_Q0)
+ if (q_fixed == fixed_point_one_Q0)
{
q_fixed /= 2;
--*right_shift;
}
- if(ignore_epsilon && *right_shift > 31)
+ if (ignore_epsilon && *right_shift > 31)
{
*right_shift = 0;
q_fixed = 0;
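
The q and shift_exp used above come from the standard frexp decomposition: any multiplier M in (0, 1) can be written as q * 2^shift_exp with q in [0.5, 1), and q is then stored as a Q0.31 integer (fixed_point_one_Q0 representing 1.0). A minimal standalone sketch of that logic (quantize_multiplier_sketch is illustrative, not the library entry point):

    #include <cmath>
    #include <cstdint>

    // Decompose `multiplier` (0 < multiplier < 1) into a Q0.31 quantized
    // multiplier plus a right shift, mirroring the logic above.
    void quantize_multiplier_sketch(float multiplier, int32_t &quant_multiplier, int32_t &right_shift)
    {
        int shift_exp = 0;
        const double q = std::frexp(multiplier, &shift_exp); // multiplier = q * 2^shift_exp, q in [0.5, 1)
        right_shift   = -shift_exp;
        int64_t q_fixed = std::llround(q * (1ll << 31));
        if (q_fixed == (1ll << 31)) // rounding pushed q up to 1.0: renormalise
        {
            q_fixed /= 2;
            --right_shift;
        }
        quant_multiplier = static_cast<int32_t>(q_fixed);
    }

For example, multiplier = 0.3 gives q = 0.6 and shift_exp = -1, so right_shift = 1 and quant_multiplier = round(0.6 * 2^31) = 1288490189.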
@@ -88,9 +90,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier,
return Status{};
}
-Status calculate_quantized_multiplier_greater_than_one(float multiplier,
- int32_t *quantized_multiplier,
- int32_t *left_shift)
+Status
+calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
@@ -101,7 +102,7 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
*left_shift = shift_exp;
auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
- if(q_fixed == fixed_point_one_Q0)
+ if (q_fixed == fixed_point_one_Q0)
{
q_fixed /= 2;
++*left_shift;
@@ -113,9 +114,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
return Status{};
}
-arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
- const QuantizationInfo &wq_info,
- const QuantizationInfo &oq_info,
+arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
GEMMLowpOutputStageInfo &stage_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
@@ -133,7 +134,7 @@ arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_i
const float i_scale = iq_info.scale().at(0);
const float o_scale = oq_info.scale().at(0);
- for(unsigned int i = 0; i < size; ++i)
+ for (unsigned int i = 0; i < size; ++i)
{
const float multiplier = i_scale * w_scales[i] / o_scale;
int32_t quant_multiplier = 0;
@@ -154,7 +155,7 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
{
int min_quant_val = 0;
int max_quant_val = 0;
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
min_quant_val = std::numeric_limits<uint8_t>::min();
@@ -179,7 +180,9 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
return std::make_pair(min_quant_val, max_quant_val);
}
-std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info,
+ const ActivationLayerInfo &act_info,
+ DataType data_type)
{
ARM_COMPUTE_ERROR_ON(data_type != DataType::QASYMM8 && data_type != DataType::QASYMM8_SIGNED);
@@ -190,20 +193,23 @@ std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const Quant
const UniformQuantizationInfo q_unif = q_info.uniform();
- if(act_info.enabled())
+ if (act_info.enabled())
{
- switch(act_info.activation())
+ switch (act_info.activation())
{
case ActivationLayerInfo::ActivationFunction::RELU:
type_min = q_unif.offset;
break;
case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
type_min = q_unif.offset;
- type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info);
+ type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+ : quantize_qasymm8_signed(act_info.a(), q_info);
break;
case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) : quantize_qasymm8_signed(act_info.b(), q_info);
- type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info);
+ type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info)
+ : quantize_qasymm8_signed(act_info.b(), q_info);
+ type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+ : quantize_qasymm8_signed(act_info.a(), q_info);
break;
default:
ARM_COMPUTE_ERROR("Activation function not supported.");
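
A worked instance of the BOUNDED_RELU case above: with a uniform QASYMM8 quantization of scale 0.1 and zero point (offset) 5, and act_info.a() = 6.0f, the bounds become type_min = q_unif.offset = 5 and, using the usual round(value / scale) + offset rule, type_max = quantize_qasymm8(6.0f, q_info) = round(6.0 / 0.1) + 5 = 65; the fused activation therefore clamps quantized outputs to [5, 65].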
@@ -226,7 +232,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
const unsigned int num_filters = wq_info.scale().size();
- for(unsigned int i = 0; i < num_filters; ++i)
+ for (unsigned int i = 0; i < num_filters; ++i)
{
int32_t output_multiplier = 0;
int32_t output_shift = 0;
@@ -267,11 +273,11 @@ int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t sh
int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
{
- if(exponent == 0)
+ if (exponent == 0)
{
return v;
}
- else if(exponent < 0)
+ else if (exponent < 0)
{
return rounding_divide_by_pow2(v, -exponent);
}
@@ -291,11 +297,14 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
}
}
-void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift)
+void get_invsqrt_quantized_multiplier_exp(int32_t input,
+ int32_t reverse_shift,
+ int32_t &output_inv_sqrt,
+ int32_t &output_shift)
{
ARM_COMPUTE_ERROR_ON(input < 0);
- if(input <= 1)
+ if (input <= 1)
{
// handle the inputs (0 and 1) separately to avoid overflow
output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
@@ -305,7 +314,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// prepare input for fixed point operation and compute shift value
output_shift = 11;
- while(input >= (1 << 29))
+ while (input >= (1 << 29))
{
input /= 4;
++output_shift;
@@ -334,9 +343,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// multiplication of two fixed point numbers, defined for readability
auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType b) -> FixedPointRawType
- {
- return saturating_rounding_doubling_highmul(a, b);
- };
+ { return saturating_rounding_doubling_highmul(a, b); };
// rescaling of fixed point to have dst_bit integer bits, defined for readability
auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType
@@ -347,17 +354,18 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// 5 iterations of the Newton-Raphson method for inverse square root: x_{n+1} = 1.5 * x_n - input/2 * (x_n)^3
constexpr int32_t num_iteration = 5;
- for(int32_t i = 0; i < num_iteration; ++i)
+ for (int32_t i = 0; i < num_iteration; ++i)
{
const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position);
- x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position);
+ x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3),
+ 6, fixedpoint_position);
}
// fixed point representation of sqrt(1/2)
const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
x = fixed_point_mul(fixedpoint_half_sqrt_2, x);
output_inv_sqrt = x;
- if(output_shift < 0)
+ if (output_shift < 0)
{
output_inv_sqrt <<= -output_shift;
output_shift = 0;
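
The iteration loop above is the classic Newton-Raphson scheme for the inverse square root: applying Newton's method to f(x) = 1/x^2 - input gives

    x_{n+1} = x_n * (3 - input * x_n^2) / 2 = 1.5 * x_n - (input / 2) * x_n^3

which is exactly fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3) in the fixed-point domain, with fixed_point_rescale keeping the x^3 intermediate representable.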
@@ -365,5 +373,5 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// convert right shift to left shift
output_shift *= reverse_shift;
}
-} // quantization
-} // arm_compute
+} // namespace quantization
+} // namespace arm_compute
diff --git a/src/core/utils/quantization/AsymmHelpers.h b/src/core/utils/quantization/AsymmHelpers.h
index f9701095cb..5dc607ce58 100644
--- a/src/core/utils/quantization/AsymmHelpers.h
+++ b/src/core/utils/quantization/AsymmHelpers.h
@@ -29,7 +29,8 @@
namespace arm_compute
{
-namespace quantization {
+namespace quantization
+{
/** Get minimum and maximum output of the activation function after quantization.
*
@@ -41,7 +42,9 @@ namespace quantization {
*
* @return The minimum and maximum output of the activation function after quantization.
*/
-std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type);
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info,
+ const ActivationLayerInfo &act_info,
+ DataType data_type);
} // namespace quantization
} // namespace arm_compute