From ddb93bbf12fc9d685e7ddbef703a886d67cbda9b Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 2 Oct 2020 16:38:59 +0100 Subject: COMPMID-3637: Move wrapper to src Signed-off-by: Georgios Pinitas Change-Id: I524b0c4b49c7a7035b7d078b9585d77b0d438e10 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4083 Reviewed-by: Michele Di Giorgio Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins --- arm_compute/core/NEON/NEAsymm.h | 753 -------------- arm_compute/core/NEON/NEAsymm.inl | 92 -- arm_compute/core/NEON/NEColorConvertHelper.inl | 1045 -------------------- arm_compute/core/NEON/NEFixedPoint.h | 41 - arm_compute/core/NEON/NEFixedPoint.inl | 43 - arm_compute/core/NEON/NEMath.h | 307 ------ arm_compute/core/NEON/NEMath.inl | 529 ---------- arm_compute/core/NEON/NESymm.h | 256 ----- .../kernels/detail/NEActivationFunctionDetail.h | 315 ------ .../NEON/kernels/detail/NEDirectConvolution3x3.h | 170 ---- .../kernels/detail/NEDirectConvolutionDetail.h | 965 ------------------ arm_compute/core/NEON/wrapper/intrinsics/abs.h | 75 -- arm_compute/core/NEON/wrapper/intrinsics/add.h | 201 ---- arm_compute/core/NEON/wrapper/intrinsics/and.h | 60 -- arm_compute/core/NEON/wrapper/intrinsics/bsl.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/ceq.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/cge.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/cgt.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/cle.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/clt.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/combine.h | 53 - arm_compute/core/NEON/wrapper/intrinsics/cvt.h | 96 -- arm_compute/core/NEON/wrapper/intrinsics/div.h | 73 -- arm_compute/core/NEON/wrapper/intrinsics/dup_n.h | 66 -- arm_compute/core/NEON/wrapper/intrinsics/eor.h | 56 -- arm_compute/core/NEON/wrapper/intrinsics/exp.h | 56 -- arm_compute/core/NEON/wrapper/intrinsics/ext.h | 62 -- arm_compute/core/NEON/wrapper/intrinsics/gethigh.h | 53 - arm_compute/core/NEON/wrapper/intrinsics/getlane.h | 223 ----- arm_compute/core/NEON/wrapper/intrinsics/getlow.h | 53 - .../core/NEON/wrapper/intrinsics/intrinsics.h | 74 -- arm_compute/core/NEON/wrapper/intrinsics/inv.h | 62 -- arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h | 61 -- arm_compute/core/NEON/wrapper/intrinsics/load.h | 73 -- arm_compute/core/NEON/wrapper/intrinsics/log.h | 56 -- arm_compute/core/NEON/wrapper/intrinsics/max.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/min.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/mla.h | 71 -- arm_compute/core/NEON/wrapper/intrinsics/movl.h | 49 - arm_compute/core/NEON/wrapper/intrinsics/movn.h | 62 -- arm_compute/core/NEON/wrapper/intrinsics/mul.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/neg.h | 58 -- arm_compute/core/NEON/wrapper/intrinsics/not.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/orr.h | 60 -- arm_compute/core/NEON/wrapper/intrinsics/pmax.h | 53 - arm_compute/core/NEON/wrapper/intrinsics/pmin.h | 53 - arm_compute/core/NEON/wrapper/intrinsics/pow.h | 48 - arm_compute/core/NEON/wrapper/intrinsics/qmov.h | 49 - arm_compute/core/NEON/wrapper/intrinsics/qmovun.h | 46 - .../core/NEON/wrapper/intrinsics/reinterpret.h | 49 - arm_compute/core/NEON/wrapper/intrinsics/rev64.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/round.h | 56 -- arm_compute/core/NEON/wrapper/intrinsics/setlane.h | 208 ---- arm_compute/core/NEON/wrapper/intrinsics/sin.h | 57 -- arm_compute/core/NEON/wrapper/intrinsics/store.h | 70 -- arm_compute/core/NEON/wrapper/intrinsics/sub.h | 103 -- 
arm_compute/core/NEON/wrapper/intrinsics/tanh.h | 47 - arm_compute/core/NEON/wrapper/intrinsics/tbl.h | 45 - arm_compute/core/NEON/wrapper/scalar/add.h | 69 -- arm_compute/core/NEON/wrapper/scalar/scalar.h | 30 - arm_compute/core/NEON/wrapper/scalar/sub.h | 69 -- arm_compute/core/NEON/wrapper/traits.h | 140 --- arm_compute/core/NEON/wrapper/wrapper.h | 34 - docs/ComputeLibrary.dir | 48 +- src/core/NEON/NEAsymm.h | 753 ++++++++++++++ src/core/NEON/NEAsymm.inl | 92 ++ src/core/NEON/NEFixedPoint.h | 41 + src/core/NEON/NEFixedPoint.inl | 43 + src/core/NEON/NEMath.h | 307 ++++++ src/core/NEON/NEMath.inl | 529 ++++++++++ src/core/NEON/NESymm.h | 256 +++++ src/core/NEON/kernels/NEActivationLayerKernel.cpp | 6 +- .../NEON/kernels/NEArithmeticAdditionKernel.cpp | 2 +- .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 6 +- .../NEON/kernels/NEBatchConcatenateLayerKernel.cpp | 4 +- .../kernels/NEBatchNormalizationLayerKernel.cpp | 8 +- src/core/NEON/kernels/NEBitwiseAndKernel.cpp | 4 +- src/core/NEON/kernels/NEColorConvertKernel.cpp | 5 +- .../kernels/NEConvertQuantizedSignednessKernel.cpp | 4 +- src/core/NEON/kernels/NECropKernel.cpp | 2 +- .../NEON/kernels/NEDepthConcatenateLayerKernel.cpp | 6 +- .../NEON/kernels/NEDepthConvertLayerKernel.cpp | 6 +- .../NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 2 +- .../NEDepthwiseConvolutionLayerNativeKernel.cpp | 4 +- .../NEON/kernels/NEDequantizationLayerKernel.cpp | 8 +- .../kernels/NEDirectConvolutionLayerKernel.cpp | 8 +- .../NEDirectConvolutionLayerOutputStageKernel.cpp | 6 +- .../NEON/kernels/NEElementwiseOperationKernel.cpp | 6 +- src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp | 2 +- src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | 6 +- src/core/NEON/kernels/NEFFTScaleKernel.cpp | 4 +- src/core/NEON/kernels/NEFloorKernel.cpp | 2 +- .../kernels/NEFuseBatchNormalizationKernel.cpp | 2 +- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 4 +- .../NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp | 2 +- ...tizeDownInt32ToInt16ScaleByFixedPointKernel.cpp | 2 +- ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 2 +- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 2 +- .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 2 +- .../NEON/kernels/NEGEMMMatrixAdditionKernel.cpp | 4 +- .../NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp | 4 +- .../kernels/NEHeightConcatenateLayerKernel.cpp | 4 +- .../kernels/NEInstanceNormalizationLayerKernel.cpp | 4 +- src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 4 +- .../NELocallyConnectedMatrixMultiplyKernel.cpp | 4 +- .../kernels/NEMeanStdDevNormalizationKernel.cpp | 6 +- .../NEON/kernels/NENormalizationLayerKernel.cpp | 6 +- src/core/NEON/kernels/NEPadLayerKernel.cpp | 4 +- .../kernels/NEPixelWiseMultiplicationKernel.cpp | 6 +- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 11 +- .../kernels/NEQLSTMLayerNormalizationKernel.cpp | 9 +- .../NEON/kernels/NEQuantizationLayerKernel.cpp | 6 +- src/core/NEON/kernels/NERangeKernel.cpp | 4 +- .../NEON/kernels/NEReductionOperationKernel.cpp | 4 +- src/core/NEON/kernels/NEReverseKernel.cpp | 2 +- src/core/NEON/kernels/NEScaleKernel.cpp | 2 +- src/core/NEON/kernels/NESelectKernel.cpp | 4 +- src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 6 +- .../NEON/kernels/NESpaceToBatchLayerKernel.cpp | 2 +- .../NEON/kernels/NESpaceToDepthLayerKernel.cpp | 4 +- src/core/NEON/kernels/NEThresholdKernel.cpp | 2 +- src/core/NEON/kernels/NEUpsampleLayerKernel.cpp | 2 +- .../NEON/kernels/NEWidthConcatenateLayerKernel.cpp | 4 +- src/core/NEON/kernels/NEYOLOLayerKernel.cpp | 9 +- 
.../kernels/detail/NEActivationFunctionDetail.h | 315 ++++++ .../NEON/kernels/detail/NEColorConvertHelper.inl | 1045 ++++++++++++++++++++ .../NEON/kernels/detail/NEDirectConvolution3x3.h | 170 ++++ .../kernels/detail/NEDirectConvolutionDetail.h | 965 ++++++++++++++++++ src/core/NEON/wrapper/intrinsics/abs.h | 75 ++ src/core/NEON/wrapper/intrinsics/add.h | 201 ++++ src/core/NEON/wrapper/intrinsics/and.h | 60 ++ src/core/NEON/wrapper/intrinsics/bsl.h | 64 ++ src/core/NEON/wrapper/intrinsics/ceq.h | 64 ++ src/core/NEON/wrapper/intrinsics/cge.h | 64 ++ src/core/NEON/wrapper/intrinsics/cgt.h | 64 ++ src/core/NEON/wrapper/intrinsics/cle.h | 64 ++ src/core/NEON/wrapper/intrinsics/clt.h | 64 ++ src/core/NEON/wrapper/intrinsics/combine.h | 53 + src/core/NEON/wrapper/intrinsics/cvt.h | 96 ++ src/core/NEON/wrapper/intrinsics/div.h | 73 ++ src/core/NEON/wrapper/intrinsics/dup_n.h | 66 ++ src/core/NEON/wrapper/intrinsics/eor.h | 56 ++ src/core/NEON/wrapper/intrinsics/exp.h | 56 ++ src/core/NEON/wrapper/intrinsics/ext.h | 62 ++ src/core/NEON/wrapper/intrinsics/gethigh.h | 53 + src/core/NEON/wrapper/intrinsics/getlane.h | 223 +++++ src/core/NEON/wrapper/intrinsics/getlow.h | 53 + src/core/NEON/wrapper/intrinsics/intrinsics.h | 74 ++ src/core/NEON/wrapper/intrinsics/inv.h | 62 ++ src/core/NEON/wrapper/intrinsics/invsqrt.h | 61 ++ src/core/NEON/wrapper/intrinsics/load.h | 73 ++ src/core/NEON/wrapper/intrinsics/log.h | 56 ++ src/core/NEON/wrapper/intrinsics/max.h | 64 ++ src/core/NEON/wrapper/intrinsics/min.h | 64 ++ src/core/NEON/wrapper/intrinsics/mla.h | 71 ++ src/core/NEON/wrapper/intrinsics/movl.h | 49 + src/core/NEON/wrapper/intrinsics/movn.h | 62 ++ src/core/NEON/wrapper/intrinsics/mul.h | 64 ++ src/core/NEON/wrapper/intrinsics/neg.h | 58 ++ src/core/NEON/wrapper/intrinsics/not.h | 64 ++ src/core/NEON/wrapper/intrinsics/orr.h | 60 ++ src/core/NEON/wrapper/intrinsics/pmax.h | 53 + src/core/NEON/wrapper/intrinsics/pmin.h | 53 + src/core/NEON/wrapper/intrinsics/pow.h | 48 + src/core/NEON/wrapper/intrinsics/qmov.h | 49 + src/core/NEON/wrapper/intrinsics/qmovun.h | 46 + src/core/NEON/wrapper/intrinsics/reinterpret.h | 49 + src/core/NEON/wrapper/intrinsics/rev64.h | 64 ++ src/core/NEON/wrapper/intrinsics/round.h | 56 ++ src/core/NEON/wrapper/intrinsics/setlane.h | 208 ++++ src/core/NEON/wrapper/intrinsics/sin.h | 57 ++ src/core/NEON/wrapper/intrinsics/store.h | 70 ++ src/core/NEON/wrapper/intrinsics/sub.h | 103 ++ src/core/NEON/wrapper/intrinsics/tanh.h | 47 + src/core/NEON/wrapper/intrinsics/tbl.h | 45 + src/core/NEON/wrapper/scalar/add.h | 69 ++ src/core/NEON/wrapper/scalar/scalar.h | 30 + src/core/NEON/wrapper/scalar/sub.h | 69 ++ src/core/NEON/wrapper/traits.h | 140 +++ src/core/NEON/wrapper/wrapper.h | 34 + 180 files changed, 8342 insertions(+), 8338 deletions(-) delete mode 100644 arm_compute/core/NEON/NEAsymm.h delete mode 100644 arm_compute/core/NEON/NEAsymm.inl delete mode 100644 arm_compute/core/NEON/NEColorConvertHelper.inl delete mode 100644 arm_compute/core/NEON/NEFixedPoint.h delete mode 100644 arm_compute/core/NEON/NEFixedPoint.inl delete mode 100644 arm_compute/core/NEON/NEMath.h delete mode 100644 arm_compute/core/NEON/NEMath.inl delete mode 100644 arm_compute/core/NEON/NESymm.h delete mode 100644 arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h delete mode 100644 arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h delete mode 100644 arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/abs.h 
delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/add.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/and.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/bsl.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/ceq.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/cge.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/cgt.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/cle.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/clt.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/combine.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/cvt.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/div.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/dup_n.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/eor.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/exp.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/ext.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/gethigh.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/getlane.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/getlow.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/inv.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/load.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/log.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/max.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/min.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/mla.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/movl.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/movn.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/mul.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/neg.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/not.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/orr.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/pmax.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/pmin.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/pow.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/qmov.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/qmovun.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/rev64.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/round.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/setlane.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/sin.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/store.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/sub.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/tanh.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/tbl.h delete mode 100644 arm_compute/core/NEON/wrapper/scalar/add.h delete mode 100644 arm_compute/core/NEON/wrapper/scalar/scalar.h delete mode 100644 arm_compute/core/NEON/wrapper/scalar/sub.h delete mode 100644 arm_compute/core/NEON/wrapper/traits.h delete mode 100644 arm_compute/core/NEON/wrapper/wrapper.h create mode 100644 src/core/NEON/NEAsymm.h create mode 100644 src/core/NEON/NEAsymm.inl create mode 100644 src/core/NEON/NEFixedPoint.h create mode 100644 
src/core/NEON/NEFixedPoint.inl create mode 100644 src/core/NEON/NEMath.h create mode 100644 src/core/NEON/NEMath.inl create mode 100644 src/core/NEON/NESymm.h create mode 100644 src/core/NEON/kernels/detail/NEActivationFunctionDetail.h create mode 100644 src/core/NEON/kernels/detail/NEColorConvertHelper.inl create mode 100644 src/core/NEON/kernels/detail/NEDirectConvolution3x3.h create mode 100644 src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h create mode 100644 src/core/NEON/wrapper/intrinsics/abs.h create mode 100644 src/core/NEON/wrapper/intrinsics/add.h create mode 100644 src/core/NEON/wrapper/intrinsics/and.h create mode 100644 src/core/NEON/wrapper/intrinsics/bsl.h create mode 100644 src/core/NEON/wrapper/intrinsics/ceq.h create mode 100644 src/core/NEON/wrapper/intrinsics/cge.h create mode 100644 src/core/NEON/wrapper/intrinsics/cgt.h create mode 100644 src/core/NEON/wrapper/intrinsics/cle.h create mode 100644 src/core/NEON/wrapper/intrinsics/clt.h create mode 100644 src/core/NEON/wrapper/intrinsics/combine.h create mode 100644 src/core/NEON/wrapper/intrinsics/cvt.h create mode 100644 src/core/NEON/wrapper/intrinsics/div.h create mode 100644 src/core/NEON/wrapper/intrinsics/dup_n.h create mode 100644 src/core/NEON/wrapper/intrinsics/eor.h create mode 100644 src/core/NEON/wrapper/intrinsics/exp.h create mode 100644 src/core/NEON/wrapper/intrinsics/ext.h create mode 100644 src/core/NEON/wrapper/intrinsics/gethigh.h create mode 100644 src/core/NEON/wrapper/intrinsics/getlane.h create mode 100644 src/core/NEON/wrapper/intrinsics/getlow.h create mode 100644 src/core/NEON/wrapper/intrinsics/intrinsics.h create mode 100644 src/core/NEON/wrapper/intrinsics/inv.h create mode 100644 src/core/NEON/wrapper/intrinsics/invsqrt.h create mode 100644 src/core/NEON/wrapper/intrinsics/load.h create mode 100644 src/core/NEON/wrapper/intrinsics/log.h create mode 100644 src/core/NEON/wrapper/intrinsics/max.h create mode 100644 src/core/NEON/wrapper/intrinsics/min.h create mode 100644 src/core/NEON/wrapper/intrinsics/mla.h create mode 100644 src/core/NEON/wrapper/intrinsics/movl.h create mode 100644 src/core/NEON/wrapper/intrinsics/movn.h create mode 100644 src/core/NEON/wrapper/intrinsics/mul.h create mode 100644 src/core/NEON/wrapper/intrinsics/neg.h create mode 100644 src/core/NEON/wrapper/intrinsics/not.h create mode 100644 src/core/NEON/wrapper/intrinsics/orr.h create mode 100644 src/core/NEON/wrapper/intrinsics/pmax.h create mode 100644 src/core/NEON/wrapper/intrinsics/pmin.h create mode 100644 src/core/NEON/wrapper/intrinsics/pow.h create mode 100644 src/core/NEON/wrapper/intrinsics/qmov.h create mode 100644 src/core/NEON/wrapper/intrinsics/qmovun.h create mode 100644 src/core/NEON/wrapper/intrinsics/reinterpret.h create mode 100644 src/core/NEON/wrapper/intrinsics/rev64.h create mode 100644 src/core/NEON/wrapper/intrinsics/round.h create mode 100644 src/core/NEON/wrapper/intrinsics/setlane.h create mode 100644 src/core/NEON/wrapper/intrinsics/sin.h create mode 100644 src/core/NEON/wrapper/intrinsics/store.h create mode 100644 src/core/NEON/wrapper/intrinsics/sub.h create mode 100644 src/core/NEON/wrapper/intrinsics/tanh.h create mode 100644 src/core/NEON/wrapper/intrinsics/tbl.h create mode 100644 src/core/NEON/wrapper/scalar/add.h create mode 100644 src/core/NEON/wrapper/scalar/scalar.h create mode 100644 src/core/NEON/wrapper/scalar/sub.h create mode 100644 src/core/NEON/wrapper/traits.h create mode 100644 src/core/NEON/wrapper/wrapper.h diff --git a/arm_compute/core/NEON/NEAsymm.h 
b/arm_compute/core/NEON/NEAsymm.h deleted file mode 100644 index d5d824e9ca..0000000000 --- a/arm_compute/core/NEON/NEAsymm.h +++ /dev/null @@ -1,753 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEASYMM_H -#define ARM_COMPUTE_NEASYMM_H - -#include "arm_compute/core/NEON/NEMath.h" -#include <arm_neon.h> - -namespace arm_compute -{ -using qasymm8x8_t = uint8x8_t; /**< 8 bit quantized asymmetric vector with 8 elements */ -using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */ -using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */ -using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */ -using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */ - -using qasymm8x8_signed_t = int8x8_t; /**< 8 bit quantized signed asymmetric vector with 8 elements */ -using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */ -using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */ -using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */ -using qasymm8x16_signed_t = int8x16_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */ - -/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector - * - * vd*vs + vo - * - * @param[in] vd Input vector value in QASYMM8 format - * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. - * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. - * - * @return A 16-component vector in QASYMM8 format, saturated to fit - */ -uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo); -
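A minimal usage sketch for vmlaq_qasymm8 (illustrative, not part of this patch; it assumes the header is reachable at its post-move location src/core/NEON/NEAsymm.h and uses made-up values). As the parameter documentation above requires, the scalar multiplier and addend are broadcast across all four F32 lanes with vdupq_n_f32 before the call:

    #include <arm_neon.h>
    #include <cstdio>
    #include "src/core/NEON/NEAsymm.h" // post-move location (assumption)

    int main()
    {
        const uint8x16_t  vd = vdupq_n_u8(10);     // 16 QASYMM8 lanes, all equal to 10
        const float32x4_t vs = vdupq_n_f32(2.0f);  // multiplier, duplicated across all four lanes
        const float32x4_t vo = vdupq_n_f32(5.0f);  // addend, duplicated across all four lanes
        const uint8x16_t  r  = arm_compute::vmlaq_qasymm8(vd, vs, vo); // each lane: 10 * 2 + 5 = 25
        printf("%u\n", static_cast<unsigned>(vgetq_lane_u8(r, 0)));    // prints 25
        return 0;
    }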
-/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector - * - * vd*vs + vo - * - * @param[in] vd Input vector value in QASYMM8_SIGNED format - * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. - * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. - * - * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit - */ -int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo); - -/** Performs final quantization step on 16 elements - * - * @param[in] in_s32 Input to be quantized. - * @param[in] result_fixedpoint_multiplier Result multiplier parameter - * @param[in] result_shift Result shift parameter - * @param[in] result_offset_after_shift_s32 Result offset parameter - * @param[in] min_u8 Relu lower bound - * @param[in] max_u8 Relu upper bound - * @param[in] is_bounded_relu Specifies whether a fused bounded relu should be applied - * - * @return Quantized values - */ -inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int32x4_t result_offset_after_shift_s32, - uint8x16_t min_u8, - uint8x16_t max_u8, - bool is_bounded_relu) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - - if(result_shift < 0) - { - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift))); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift))); - - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); - } - else - { - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); - - // Round to the nearest division by a power-of-two using result_shift_s32 - in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); - in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift); - in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift); - in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift); - } - - // Add the offset terms - in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to U8 - uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_u8 = vmaxq_u8(out_u8, min_u8); - out_u8 = vminq_u8(out_u8, max_u8); - } - - return out_u8; -} -
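For context, a hedged sketch of how a requantizing output stage might drive the 16-element overload above (values are illustrative; it assumes the post-move header src/core/NEON/NEAsymm.h is on the include path):

    #include <arm_neon.h>
    #include "src/core/NEON/NEAsymm.h" // post-move location (assumption)

    // Requantize four int32x4 accumulators to QASYMM8 with a fused bounded relu.
    uint8x16_t requantize_block(int32x4x4_t acc)
    {
        const int        multiplier = 1 << 30;          // ~0.5 in Q31 fixed point (illustrative)
        const int32_t    shift      = 1;                // extra rounding divide by 2
        const int32x4_t  offset     = vdupq_n_s32(10);  // output zero point
        const uint8x16_t lo         = vdupq_n_u8(0);    // relu bounds
        const uint8x16_t hi         = vdupq_n_u8(255);
        return arm_compute::finalize_quantization(acc, multiplier, shift, offset, lo, hi, true);
    }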
-/** Performs final quantization step on 16 elements - * - * @param[in] in_s32 Input to be quantized. - * @param[in] result_fixedpoint_multiplier Result multiplier parameter - * @param[in] result_shift Result shift parameter - * @param[in] result_offset_after_shift_s32 Result offset parameter - * @param[in] min_s8 Relu lower bound - * @param[in] max_s8 Relu upper bound - * @param[in] is_bounded_relu Specifies whether a fused bounded relu should be applied - * - * @return Quantized values - */ -inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int32x4_t result_offset_after_shift_s32, - int8x16_t min_s8, - int8x16_t max_s8, - bool is_bounded_relu) -{ - if(result_shift < 0) - { - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift))); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift))); - - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); - } - else - { - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); - in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); - - // Round to the nearest division by a power-of-two using result_shift_s32 - in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); - in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift); - in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift); - in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift); - } - - // Add the offset terms - in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_s8 = vmaxq_s8(out_s8, min_s8); - out_s8 = vminq_s8(out_s8, max_s8); - } - - return out_s8; -} - -/** Performs final quantization step on 16 elements for symmetric quantization - * - * @param[in] in_s32 Input to be quantized.
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter - * @param[in] result_shift Result shift parameter - * @param[in] result_offset_after_shift_s32 Result offset parameter - * @param[in] min_s8 Relu lower bound - * @param[in] max_s8 Relu upper bound - * @param[in] is_bounded_relu Specifies whether a fused bounded relu should be applied - * - * @return Quantized values - */ -inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, - const int32x4x4_t &result_fixedpoint_multiplier, - const int32x4x4_t &result_shift, - const int32x4_t &result_offset_after_shift_s32, - const int8x16_t &min_s8, - const int8x16_t &max_s8, - const bool is_bounded_relu) -{ - const static int32x4_t one_s32 = vdupq_n_s32(1); - - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - int32x4x4_t res_shift_gt0 = - { - vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]), - vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]), - vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]), - vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]), - }; - // Round to the nearest division by a power-of-two using result_shift_s32 - res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]); - res_shift_gt0.val[1] = rounding_divide_by_pow2(res_shift_gt0.val[1], result_shift.val[1]); - res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]); - res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]); - - int32x4x4_t res_shift_lt0 = - { - vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))), - vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))), - vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))), - vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))), - }; - res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]); - res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]); - res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]); - res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]); - - // Select result depending on shift value - const uint32x4x4_t mask_lt0 = - { -#ifdef __aarch64__ - vcltzq_s32(result_shift.val[0]), - vcltzq_s32(result_shift.val[1]), - vcltzq_s32(result_shift.val[2]), - vcltzq_s32(result_shift.val[3]), -#else //__aarch64__ - vcltq_s32(result_shift.val[0], vdupq_n_s32(0)), - vcltq_s32(result_shift.val[1], vdupq_n_s32(0)), - vcltq_s32(result_shift.val[2], vdupq_n_s32(0)), - vcltq_s32(result_shift.val[3], vdupq_n_s32(0)), -#endif //__aarch64__ - }; - - in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]); - in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]); - in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]); - in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]); - - // Add the offset terms - in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); - - // Convert S32 to
S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_s8 = vmaxq_s8(out_s8, min_s8); - out_s8 = vminq_s8(out_s8, max_s8); - } - - return out_s8; -} - -/** Performs final quantization step on single element - * - * @param[in] in_value Input to be quantized. - * @param[in] result_fixedpoint_multiplier Result multiplier parameter - * @param[in] result_shift Result shift parameter - * @param[in] result_offset_after_shift_s32 Result offset parameter - * @param[in] min_u8 Relu lower bound - * @param[in] max_u8 Relu upper bound - * @param[in] is_bounded_relu Specifies whether a fused bounded relu should be applied - * - * @return Quantized value - */ -inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu) -{ - int32x4_t in_s32 = vdupq_n_s32(in_value); - - if(result_shift < 0) - { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); - } - else - { - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); - // Shift value by result_shift_s32 - in_value = rounding_divide_by_pow2(in_value, result_shift); - } - - // Add the offset term - in_value += result_offset_after_shift_s32; - - // Bound the result - uint8_t out_u8 = static_cast<uint8_t>(std::max(0, std::min(255, in_value))); - if(is_bounded_relu) - { - out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8))); - } - - return out_u8; -} - -/** Performs final quantization step on single element - * - * @param[in] in_value Input to be quantized. - * @param[in] result_fixedpoint_multiplier Result multiplier parameter - * @param[in] result_shift Result shift parameter - * @param[in] result_offset_after_shift_s32 Result offset parameter - * @param[in] min_s8 Relu lower bound - * @param[in] max_s8 Relu upper bound - * @param[in] is_bounded_relu Specifies whether a fused bounded relu should be applied - * - * @return Quantized value - */ -inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - int8_t min_s8, int8_t max_s8, bool is_bounded_relu) -{ - int32x4_t in_s32 = vdupq_n_s32(in_value); - - if(result_shift < 0) - { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); - } - else - { - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); - - // Shift value by result_shift_s32 - in_value = rounding_divide_by_pow2(in_value, result_shift); - } - - // Add the offset term - in_value += result_offset_after_shift_s32; - - // Bound the result - int8_t out_s8 = static_cast<int8_t>(std::max(-128, std::min(127, in_value))); - if(is_bounded_relu) - { - out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8))); - } - - return out_s8; -} -
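A worked trace of the scalar uint8 overload above, with made-up numbers, may help (it assumes the post-move header is available):

    #include "src/core/NEON/NEAsymm.h" // post-move location (assumption)

    // in_value = 150, multiplier = 1 << 30 (~0.5 in Q31), shift = 1, offset = 10:
    //   vqrdmulhq_n_s32 : round(150 * 2^30 / 2^31)     = 75
    //   rounding_divide_by_pow2(75, 1) = (75 + 1) >> 1 = 38
    //   38 + 10 = 48, clamped to [0, 255]              -> 48
    const uint8_t q = arm_compute::finalize_quantization(
        150, 1 << 30, 1, 10,
        static_cast<uint8_t>(0), static_cast<uint8_t>(255), false); // q == 48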
-/** Dequantize a neon vector holding 8 quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale), - } - }; - return vdequantized_input; -} - -/** Dequantize a neon vector holding 8 signed quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), - } - }; - return vdequantized_input; -} - -/** Dequantize a neon vector holding 16 quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; - return vdequantized_input; -} -
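As a usage illustration of the vdequantize overloads above (a hedged sketch; real_value = (quantized - offset) * scale, header paths assumed per this patch's new layout, values made up):

    #include <arm_neon.h>
    #include "arm_compute/core/QuantizationInfo.h" // UniformQuantizationInfo (assumption on exact header)
    #include "src/core/NEON/NEAsymm.h"             // post-move location (assumption)

    // Dequantize eight QASYMM8 lanes: (130 - 128) * 0.5 = 1.0f in every output lane
    const uint8x8_t qv = vdup_n_u8(130);
    const arm_compute::UniformQuantizationInfo qi(0.5f, 128); // scale, offset
    const float32x4x2_t dq = arm_compute::vdequantize(qv, qi);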
-/** Dequantize a neon vector holding 16 signed quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; - return vdequantized_input; -} - -/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] scale Quantization scaling factor. - * @param[in] offset Zero quantization offset. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset) -{ - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; - return vdequantized_input; -} - -/** Dequantize a vector of 16 values stored as signed asymmetric. - * - * @param[in] qv Input values to be dequantized. - * @param[in] scale Quantization scaling factor. - * @param[in] offset Zero quantization offset. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset) -{ - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; - return vdequantized_input; -} - -/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] vscale Vector containing quantization scaling factors.
- * - * @return Dequantized values in a neon vector - */ -inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale) -{ - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), - } - }; - return vdequantized_input; -} - -/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] scale Quantization scaling factor. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale) -{ - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - } - }; - return vdequantized_input; -} - -/** Quantize a neon vector holding 8 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return A neon vector holding the quantized values - */ -inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), -#endif //__aarch64__ - } - }; - return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); -} - -/** Quantize a neon vector holding 8 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return A neon vector holding the signed quantized values - */ -inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), -#endif //__aarch64__ - } - }; - return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); -} - -/** Quantize a neon vector holding 16 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] qi Quantization information to be used in the computation.
- * - * @return A neon vector holding the quantized values - */ -inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#endif //__aarch64__ - } - }; - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - return vcombine_u8(pa, pb); -} - -/** Signed quantize a neon vector holding 16 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return A neon vector holding the quantized values - */ -inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#endif //__aarch64__ - } - }; - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - return vcombine_s8(pa, pb); -} - -/** Quantize to QASYMM16 a neon vector holding 16 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] qi Quantization information to be used in the computation. 
- * - * @return A neon vector holding the quantized values - */ -inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#endif //__aarch64__ - } - }; - const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1])); - const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3])); - return { pa, pb }; -} -} // namespace arm_compute -#include "arm_compute/core/NEON/NEAsymm.inl" -#endif // ARM_COMPUTE_NEASYMM_H diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl deleted file mode 100644 index d211382f7a..0000000000 --- a/arm_compute/core/NEON/NEAsymm.inl +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -namespace arm_compute -{ -inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo) -{ - // Convert uint8 vectors to uint16 vectors - const uint8x8_t vd_low = vget_low_u8(vd); - const uint8x8_t vd_high = vget_high_u8(vd); - uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low); - uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high); - // Convert uint16 vectors to uint32 vectors - uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8)); - uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8)); - uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8)); - uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8)); - // Convert uint32 vectors to float32 vectors - float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4); - float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4); - float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4); - float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4); - // vd = vd*vs + vo - A_f32x4 = vmlaq_f32(vo, A_f32x4, vs); - B_f32x4 = vmlaq_f32(vo, B_f32x4, vs); - C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); - D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); - // Convert float32 vectors to uint32 vectors - A_u32x4 = vcvtq_u32_f32(A_f32x4); - B_u32x4 = vcvtq_u32_f32(B_f32x4); - C_u32x4 = vcvtq_u32_f32(C_f32x4); - D_u32x4 = vcvtq_u32_f32(D_f32x4); - // Convert uint32 vectors to uint16 vectors (with saturation) - vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4)); - vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4)); - // convert uint16 vectors to uint8 vectors (with saturation) - return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); -} -inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo) -{ - // Convert int8 vectors to int16 vectors - const int8x8_t vd_low = vget_low_s8(vd); - const int8x8_t vd_high = vget_high_s8(vd); - int16x8_t vd_low_s16x8 = vmovl_s8(vd_low); - int16x8_t vd_high_s16x8 = vmovl_s8(vd_high); - // Convert int16 vectors to int32 vectors - int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8)); - int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8)); - int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8)); - int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8)); - // Convert int32 vectors to float32 vectors - float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4); - float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4); - float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4); - float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4); - // vd = vd*vs + vo - A_f32x4 = vmlaq_f32(vo, A_f32x4, vs); - B_f32x4 = vmlaq_f32(vo, B_f32x4, vs); - C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); - D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); - // Convert float32 vectors to int32 vectors - A_s32x4 = vcvtq_s32_f32(A_f32x4); - B_s32x4 = vcvtq_s32_f32(B_f32x4); - C_s32x4 = vcvtq_s32_f32(C_f32x4); - D_s32x4 = vcvtq_s32_f32(D_f32x4); - // Convert int32 vectors to int16 vectors (with saturation) - vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4)); - vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4)); - // convert int16 vectors to int8 vectors (with saturation) - return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8)); -} -} // namespace arm_compute
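The saturating narrows at the end of vmlaq_qasymm8_signed are what keep out-of-range results representable; a small hedged example (assuming the post-move header, made-up values):

    #include <arm_neon.h>
    #include "src/core/NEON/NEAsymm.h" // post-move location (assumption)

    const int8x16_t   vd = vdupq_n_s8(100);
    const float32x4_t vs = vdupq_n_f32(2.0f);
    const float32x4_t vo = vdupq_n_f32(50.0f);
    // 100 * 2 + 50 = 250 exceeds the int8 range, so the vqmovn narrows
    // saturate every lane to 127 instead of wrapping around.
    const int8x16_t r = arm_compute::vmlaq_qasymm8_signed(vd, vs, vo);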
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl deleted file mode 100644 index 9fc1be5406..0000000000 --- a/arm_compute/core/NEON/NEColorConvertHelper.inl +++ /dev/null @@ -1,1045 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IMultiImage.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/Utils.h" - -#include <arm_neon.h> - -namespace -{ -#ifndef DOXYGEN_SKIP_THIS -constexpr float red_coef_bt709 = 1.5748F; -constexpr float green_coef_bt709 = -0.1873f; -constexpr float green_coef2_bt709 = -0.4681f; -constexpr float blue_coef_bt709 = 1.8556f; - -constexpr float rgb2yuv_bt709_kr = 0.2126f; -constexpr float rgb2yuv_bt709_kb = 0.0722f; -// K_g = 1 - K_r - K_b -constexpr float rgb2yuv_bt709_kg = 0.7152f; -// C_u = 1 / (2 * (1 - K_b)) -constexpr float rgb2yuv_bt709_cu = 0.5389f; -// C_v = 1 / (2 * (1 - K_r)) -constexpr float rgb2yuv_bt709_cv = 0.6350f; - -constexpr float rgb2u8_red_coef = 0.2126f; -constexpr float rgb2u8_green_coef = 0.7152f; -constexpr float rgb2u8_blue_coef = 0.0722f; - -inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, - const float rcoef, const float gcoef, const float bcoef) -{ - float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); - greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); - greyscale = vmlaq_n_f32(greyscale, bcolor, bcoef); - return greyscale; -} -
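rgb_to_greyscale_calculation above is a plain BT.709 weighted sum, Y = 0.2126 R + 0.7152 G + 0.0722 B; a self-contained sketch of the same pattern with raw intrinsics (illustrative, not part of the patch):

    #include <arm_neon.h>

    // Four pixels at a time: y = 0.2126*r + 0.7152*g + 0.0722*b
    float32x4_t greyscale4(float32x4_t r, float32x4_t g, float32x4_t b)
    {
        float32x4_t y = vmulq_n_f32(r, 0.2126f); // y  = r * red coefficient
        y = vmlaq_n_f32(y, g, 0.7152f);          // y += g * green coefficient
        y = vmlaq_n_f32(y, b, 0.0722f);          // y += b * blue coefficient
        return y;                                // white (255,255,255) -> 255 in every lane
    }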
-inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) -{ - float32x4x4_t out_float32; - - //Conversion from 3(RGB) 4 uint8s to 3(RGB) 4 floats - const float32x4x4_t r_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[0]); - const float32x4x4_t g_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[1]); - const float32x4x4_t b_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[2]); - - //New grayscale image = ( (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B) ) - //Computation of 1(Greyscale) 4 uint8 using 3(RGB) 4 uint8s float - out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0], - rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); - - out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1], - rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); - - out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2], - rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); - - out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3], - rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); - - //Conversion from 1(Greyscale) 4 floats to 1(Greyscale) 4 uint8s - arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); -} - -inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, - float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec) -{ - /* - Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' - U'=-0.1146*R' - 0.3854*G' + 0.5000*B' - V'= 0.5000*R' - 0.4542*G' - 0.0458*B' - */ - const auto c128 = vdupq_n_f32(128.f); - - // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b - yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr); - yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg); - yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb); - - // U = (B - Y) / (2 * (1 - K_b)) - uvec = vsubq_f32(bvec, yvec); - uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu); - - // V = (R - Y) / (2 * (1 - K_r)) - vvec = vsubq_f32(rvec, yvec); - vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv); -} - -inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val, - float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha) -{ - float32x4x3_t rgb1, rgb2; - - // Compute: cb - 128 and cr - 128; - const auto c128 = vdupq_n_f32(128.f); - uvec_val = vsubq_f32(uvec_val, c128); - vvec_val = vsubq_f32(vvec_val, c128); - - // Compute: - // r = 0.0000f*f_u + 1.5748f*f_v; - // g = -0.1873f*f_u - 0.4681f*f_v; - // b = 1.8556f*f_u + 0.0000f*f_v; - const auto red = vmulq_n_f32(vvec_val, red_coef_bt709); - const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709); - const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), - vmulq_n_f32(vvec_val, green_coef2_bt709)); - - // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
- // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t - // and written back to memory using vst3 instruction - - rgb1.val[0] = vaddq_f32(yvec_val, red); - rgb1.val[1] = vaddq_f32(yvec_val, green); - rgb1.val[2] = vaddq_f32(yvec_val, blue); - - rgb2.val[0] = vaddq_f32(yyvec_val, red); - rgb2.val[1] = vaddq_f32(yyvec_val, green); - rgb2.val[2] = vaddq_f32(yyvec_val, blue); - - uint8x8x3_t u8_rgb; - arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb); - - if(!alpha) - { - vst3_lane_u8(&output_ptr[0], u8_rgb, 0); - vst3_lane_u8(&output_ptr[3], u8_rgb, 4); - vst3_lane_u8(&output_ptr[6], u8_rgb, 1); - vst3_lane_u8(&output_ptr[9], u8_rgb, 5); - vst3_lane_u8(&output_ptr[12], u8_rgb, 2); - vst3_lane_u8(&output_ptr[15], u8_rgb, 6); - vst3_lane_u8(&output_ptr[18], u8_rgb, 3); - vst3_lane_u8(&output_ptr[21], u8_rgb, 7); - } - else - { - uint8x8x4_t u8_rgba; - u8_rgba.val[0] = u8_rgb.val[0]; - u8_rgba.val[1] = u8_rgb.val[1]; - u8_rgba.val[2] = u8_rgb.val[2]; - u8_rgba.val[3] = vdup_n_u8(255); - vst4_lane_u8(&output_ptr[0], u8_rgba, 0); - vst4_lane_u8(&output_ptr[4], u8_rgba, 4); - vst4_lane_u8(&output_ptr[8], u8_rgba, 1); - vst4_lane_u8(&output_ptr[12], u8_rgba, 5); - vst4_lane_u8(&output_ptr[16], u8_rgba, 2); - vst4_lane_u8(&output_ptr[20], u8_rgba, 6); - vst4_lane_u8(&output_ptr[24], u8_rgba, 3); - vst4_lane_u8(&output_ptr[28], u8_rgba, 7); - } -} - -inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha) -{ - uint8x16x3_t rgb; - - if(alpha) - { - const auto tmp = vld4q_u8(ptr); - rgb.val[0] = tmp.val[0]; - rgb.val[1] = tmp.val[1]; - rgb.val[2] = tmp.val[2]; - } - else - { - rgb = vld3q_u8(ptr); - } - - return rgb; -} - -inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom) -{ - // Convert the uint8x16_t to float32x4x4_t - const float32x4x4_t frvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[0]); - const float32x4x4_t fgvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[1]); - const float32x4x4_t fbvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[2]); - - const float32x4x4_t frvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[0]); - const float32x4x4_t fgvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[1]); - const float32x4x4_t fbvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[2]); - - float32x4x4_t fyvec_top, fuvec_top, fvvec_top; - float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom; - - for(auto i = 0; i < 4; ++i) - { - rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], - fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]); - rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], - fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]); - } - - arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); - arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]); - arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]); - arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]); - arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]); - arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); -} - -inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t 
&bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, - unsigned char *const __restrict out_uv) -{ - uint8x16x3_t vec_top, vec_bottom; - vec_top.val[0] = rvec_top; - vec_top.val[1] = gvec_top; - vec_top.val[2] = bvec_top; - vec_bottom.val[0] = rvec_bottom; - vec_bottom.val[1] = gvec_bottom; - vec_bottom.val[2] = bvec_bottom; - - rgb_to_yuv_conversion(vec_top, vec_bottom); - - vst1q_u8(out_y_top, vec_top.val[0]); - vst1q_u8(out_y_bottom, vec_bottom.val[0]); - - const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]); - const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]); - const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]); - const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]); - - uint8x8x2_t uvvec; - uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp)); - uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp)); - - vst2_u8(out_uv, uvvec); -} - -inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, - unsigned char *const __restrict out_u, - unsigned char *const __restrict out_v) -{ - uint8x16x3_t vec_top, vec_bottom; - vec_top.val[0] = rvec_top; - vec_top.val[1] = gvec_top; - vec_top.val[2] = bvec_top; - vec_bottom.val[0] = rvec_bottom; - vec_bottom.val[1] = gvec_bottom; - vec_bottom.val[2] = bvec_bottom; - - rgb_to_yuv_conversion(vec_top, vec_bottom); - - vst1q_u8(out_y_top, vec_top.val[0]); - vst1q_u8(out_y_bottom, vec_bottom.val[0]); - - const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); - const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]); - const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), - vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); - - vst1_u8(out_u, vget_low_u8(uvvec)); - vst1_u8(out_v, vget_high_u8(uvvec)); -} - -inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, - unsigned char *const __restrict out_y, - unsigned char *const __restrict out_u, - unsigned char *const __restrict out_v) -{ - // Convert the uint8x16_t to float32x4x4_t - const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec); - const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec); - const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); - - float32x4x4_t fyvec, fuvec, fvvec; - for(auto i = 0; i < 4; ++i) - { - rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], - fyvec.val[i], fuvec.val[i], fvvec.val[i]); - } - - uint8x16_t yvec, uvec, vvec; - arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec); - arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec); - arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec); - - vst1q_u8(out_y, yvec); - vst1q_u8(out_u, uvec); - vst1q_u8(out_v, vvec); -} -#endif /* DOXYGEN_SKIP_THIS */ -} - -namespace arm_compute -{ -/** Convert RGB to RGBX. - * - * @param[in] input Input RGB data buffer. - * @param[out] output Output RGBX buffer. - * @param[in] win Window for iterating the buffers. 
- * - */ -void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - Iterator in(input_ptr, win); - Iterator out(output_ptr, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16x4_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - ta2.val[3] = vdupq_n_u8(255); - vst4q_u8(out.ptr(), ta2); - }, - in, out); -} - -/** Convert RGB to U8. - * - * @param[in] input Input RGB data buffer. - * @param[out] output Output U8 buffer. - * @param[in] win Window for iterating the buffers. - * - */ -void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - Iterator in(input_ptr, win); - Iterator out(output_ptr, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16_t ta2; - rgb_to_u8_conversion(ta1, ta2); - vst1q_u8(out.ptr(), ta2); - }, - in, out); -} - -/** Convert RGBX to RGB. - * - * @param[in] input Input RGBX data buffer. - * @param[out] output Output RGB buffer. - * @param[in] win Window for iterating the buffers. - * - */ -void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - Iterator in(input_ptr, win); - Iterator out(output_ptr, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld4q_u8(in.ptr()); - uint8x16x3_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - vst3q_u8(out.ptr(), ta2); - }, - in, out); -} - -/** Convert YUYV to RGB. - * - * @param[in] input Input YUYV data buffer. - * @param[out] output Output RGB buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - constexpr auto element_size = alpha ? 32 : 24; - constexpr auto shift = yuyv ? 0 : 1; - - Iterator in(input_ptr, win); - Iterator out(output_ptr, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta = vld4q_u8(in.ptr()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... 
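The yuyv/shift trick above folds YUYV and UYVY into one kernel: per four-byte macropixel, YUYV stores Y0 U Y1 V while UYVY stores U Y0 V Y1, so offsetting the lane index by `shift` selects the right plane in both cases. A minimal scalar sketch of the same selection (illustrative only; the helper name is hypothetical):

    #include <cstdint>

    // Unpack one 4-byte macropixel; yuyv == true -> YUYV, false -> UYVY.
    static void unpack_macropixel(const uint8_t *p, bool yuyv,
                                  uint8_t &y0, uint8_t &u, uint8_t &y1, uint8_t &v)
    {
        const int shift = yuyv ? 0 : 1;
        y0 = p[0 + shift];
        u  = p[1 - shift];
        y1 = p[2 + shift];
        v  = p[3 - shift];
    }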
- - // Convert the uint8x16x4_t to float32x4x4_t - const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); - const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); - const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); - const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); - - yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - }, - in, out); -} - -/** Convert NV12 to RGB. - * - * @param[in] input Input NV12 data buffer. - * @param[out] output Output RGB buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - constexpr auto element_size = alpha ? 32 : 24; - const auto out_stride = output_ptr->info()->strides_in_bytes().y(); - constexpr auto shift = uv ? 0 : 1; - - // UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in_y(input_ptr->plane(0), win); - Iterator in_uv(input_ptr->plane(1), win_uv); - Iterator out(output_ptr, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... 
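The yuyv_to_rgb_calculation calls below (shared by the YUYV, NV12/NV21 and IYUV paths) apply the BT.709 coefficients defined at the top of the file. For reference, a scalar per-pixel sketch of the same maths (illustrative only, not part of this patch):

    #include <algorithm>
    #include <cstdint>

    static void yuv_to_rgb_bt709(float y, float u, float v,
                                 uint8_t &r, uint8_t &g, uint8_t &b)
    {
        const float fu = u - 128.f;
        const float fv = v - 128.f;
        const auto clamp_u8 = [](float x)
        {
            return static_cast<uint8_t>(std::min(std::max(x, 0.f), 255.f));
        };
        r = clamp_u8(y + 1.5748f * fv);
        g = clamp_u8(y - 0.1873f * fu - 0.4681f * fv);
        b = clamp_u8(y + 1.8556f * fu);
    }

The vector version gets the clamp for free from the saturating narrows in convert_float32x4x3_to_uint8x8x3.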
- - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); - - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_uv, out); -} - -/** Convert IYUV to RGB. - * - * @param[in] input Input IYUV data buffer. - * @param[out] output Output RGB buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - constexpr auto element_size = alpha ? 
32 : 24;
-    const auto     out_stride = output_ptr->info()->strides_in_bytes().y();
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in_y(input_ptr->plane(0), win);
-    Iterator in_u(input_ptr->plane(1), win_uv);
-    Iterator in_v(input_ptr->plane(2), win_uv);
-    Iterator out(output_ptr, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto *y_top_ptr    = in_y.ptr();
-        const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
-        const auto *u_ptr        = in_u.ptr();
-        const auto *v_ptr        = in_v.ptr();
-
-        // Work-around issue in gcc (>= 9) where vld2q might cause issues with register allocation
-#if defined(__aarch64__)
-        const auto ta0_y_top    = vld1q_u8(y_top_ptr);
-        const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
-        const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
-        const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
-        const auto ta_u         = vld1q_u8(u_ptr);
-        const auto ta_v         = vld1q_u8(v_ptr);
-
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
-#else  /* defined(__aarch64__) */
-        const auto ta_y_top    = vld2q_u8(y_top_ptr);
-        const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
-        const auto ta_u        = vld1q_u8(u_ptr);
-        const auto ta_v        = vld1q_u8(v_ptr);
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_u.val[0] = U0 U2 U4 U6 ...
-        //ta_v.val[0] = V0 V2 V4 V6 ...
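The two load paths in this #if/#else are behaviourally identical: the AArch64 branch loads 32 contiguous bytes and de-interleaves afterwards with vuzp1q/vuzp2q, whereas vld2q_u8 de-interleaves while loading. A minimal sketch of the equivalence (assuming p points at 32 readable bytes):

    #include <arm_neon.h>
    #include <cstdint>

    static void load_deinterleaved(const uint8_t *p, uint8x16_t &even, uint8x16_t &odd)
    {
    #if defined(__aarch64__)
        const uint8x16_t a = vld1q_u8(p);
        const uint8x16_t b = vld1q_u8(p + 16);
        even = vuzp1q_u8(a, b); // bytes 0, 2, 4, ...
        odd  = vuzp2q_u8(a, b); // bytes 1, 3, 5, ...
    #else
        const uint8x16x2_t t = vld2q_u8(p); // de-interleaves while loading
        even = t.val[0];
        odd  = t.val[1];
    #endif
    }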
-
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
-#endif /* defined(__aarch64__) */
-
-        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
-        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
-    },
-    in_y, in_u, in_v, out);
-}
-
-/** Convert YUYV to NV12.
- *
- * @param[in]  input  Input YUYV data buffer.
- * @param[out] output Output NV12 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool yuyv>
-void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    constexpr auto shift = yuyv ? 0 : 1;
-
-    // NV12's UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in(input_ptr, win);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_uv(output_ptr->plane(1), win_uv);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_top    = vld4q_u8(in.ptr());
-        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
-        //ta.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta.val[1] = U0 U2 U4 U6 ...
-        //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V6 ...
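Because YUYV is already chroma-subsampled horizontally (4:2:2), producing NV12 (4:2:0) only requires averaging the chroma of the two source rows; the vhaddq_u8 calls just below compute the truncating average (a + b) >> 1 per lane without intermediate overflow. Scalar equivalent for one sample (illustrative only):

    #include <cstdint>

    static inline uint8_t average_rows(uint8_t top, uint8_t bottom)
    {
        // Same result as one lane of vhaddq_u8; vrhaddq_u8 would round instead of truncate.
        return static_cast<uint8_t>((static_cast<uint16_t>(top) + bottom) >> 1);
    }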
- - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16x2_t uvvec; - uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst2q_u8(out_uv.ptr(), uvvec); - }, - in, out_y, out_uv); -} - -/** Convert IYUV to NV12. - * - * @param[in] input Input IYUV data buffer. - * @param[out] output Output NV12 buffer. - * @param[in] win Window for iterating the buffers. - * - */ -void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - // UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in_y(input_ptr->plane(0), win); - Iterator in_u(input_ptr->plane(1), win_uv); - Iterator in_v(input_ptr->plane(2), win_uv); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_uv(output_ptr->plane(1), win_uv); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - uint8x16x2_t ta_uv; - ta_uv.val[0] = vld1q_u8(in_u.ptr()); - ta_uv.val[1] = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst2q_u8(out_uv.ptr(), ta_uv); - }, - in_y, in_u, in_v, out_y, out_uv); -} - -/** Convert NV12 to IYUV. - * - * @param[in] input Input NV12 data buffer. - * @param[out] output Output IYUV buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - constexpr auto shift = uv ? 
0 : 1; - - // UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in_y(input_ptr->plane(0), win); - Iterator in_uv(input_ptr->plane(1), win_uv); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_u(output_ptr->plane(1), win_uv); - Iterator out_v(output_ptr->plane(2), win_uv); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); - vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); - }, - in_y, in_uv, out_y, out_u, out_v); -} - -/** Convert YUYV to IYUV. - * - * @param[in] input Input YUYV data buffer. - * @param[out] output Output IYUV buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - constexpr auto shift = yuyv ? 0 : 1; - - // Destination's UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in(input_ptr, win); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_u(output_ptr->plane(1), win_uv); - Iterator out_v(output_ptr->plane(2), win_uv); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16_t uvec; - uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - vst1q_u8(out_u.ptr(), uvec); - - uint8x16_t vvec; - vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst1q_u8(out_v.ptr(), vvec); - }, - in, out_y, out_u, out_v); -} - -/** Convert NV12 to YUV4. - * - * @param[in] input Input NV12 data buffer. - * @param[out] output Output YUV4 buffer. - * @param[in] win Window for iterating the buffers. 
- * - */ -template -void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - constexpr auto shift = uv ? 0 : 1; - - // UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in_y(input_ptr->plane(0), win); - Iterator in_uv(input_ptr->plane(1), win_uv); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_u(output_ptr->plane(1), win); - Iterator out_v(output_ptr->plane(2), win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_uv.val[0 + shift]; - uvec.val[1] = ta_uv.val[0 + shift]; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_uv.val[1 - shift]; - vvec.val[1] = ta_uv.val[1 - shift]; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_uv, out_y, out_u, out_v); -} - -/** Convert IYUV to YUV4. - * - * @param[in] input Input IYUV data buffer. - * @param[out] output Output YUV4 buffer. - * @param[in] win Window for iterating the buffers. - * - */ -void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - // UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in_y(input_ptr->plane(0), win); - Iterator in_u(input_ptr->plane(1), win_uv); - Iterator in_v(input_ptr->plane(2), win_uv); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_u(output_ptr->plane(1), win); - Iterator out_v(output_ptr->plane(2), win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_u = vld1q_u8(in_u.ptr()); - const auto ta_v = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u = U0 U2 U4 U6 ... - //ta_v = V0 V2 V4 V6 ... 
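The stores below (as in colorconvert_nv12_to_yuv4 above) upsample chroma by nearest-neighbour duplication: writing the same vector through both halves of a vst2q_u8 repeats each sample along x, and issuing the store again one stride down repeats it along y. A scalar sketch under a hypothetical buffer layout:

    #include <cstddef>
    #include <cstdint>

    static void upsample2x2(const uint8_t *src, uint8_t *dst, size_t src_width, size_t dst_stride)
    {
        for (size_t x = 0; x < src_width; ++x)
        {
            const uint8_t c = src[x];
            dst[2 * x]                  = c; // duplicate along x
            dst[2 * x + 1]              = c;
            dst[dst_stride + 2 * x]     = c; // duplicate along y
            dst[dst_stride + 2 * x + 1] = c;
        }
    }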
- - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_u; - uvec.val[1] = ta_u; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_v; - vvec.val[1] = ta_v; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_u, in_v, out_y, out_u, out_v); -} - -/** Convert RGB to NV12. - * - * @param[in] input Input RGB data buffer. - * @param[out] output Output NV12 buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - // UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in(input_ptr, win); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_uv(output_ptr->plane(1), win_uv); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_uv.ptr()); - }, - in, out_y, out_uv); -} - -/** Convert RGB to IYUV. - * - * @param[in] input Input RGB data buffer. - * @param[out] output Output IYUV buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - // UV's width and height are subsampled - Window win_uv(win); - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - win_uv.validate(); - - Iterator in(input_ptr, win); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_u(output_ptr->plane(1), win_uv); - Iterator out_v(output_ptr->plane(2), win_uv); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... 
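The store_rgb_to_iyuv call below (like the NV12 and YUV4 variants) funnels into rgb_to_yuv_calculation, i.e. the forward BT.709 transform built from the rgb2yuv_* constants at the top of the file. Scalar reference (illustrative only):

    static void rgb_to_yuv_bt709(float r, float g, float b,
                                 float &y, float &u, float &v)
    {
        y = 0.2126f * r + 0.7152f * g + 0.0722f * b; // K_r, K_g, K_b
        u = 0.5389f * (b - y) + 128.f;               // 1 / (2 * (1 - K_b))
        v = 0.6350f * (r - y) + 128.f;               // 1 / (2 * (1 - K_r))
    }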
- - store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); -} - -/** Convert RGB to YUV4. - * - * @param[in] input Input RGB data buffer. - * @param[out] output Output YUV4 buffer. - * @param[in] win Window for iterating the buffers. - * - */ -template -void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) -{ - ARM_COMPUTE_ERROR_ON(nullptr == input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - win.validate(); - - const auto input_ptr = static_cast(input); - const auto output_ptr = static_cast(output); - - Iterator in(input_ptr, win); - Iterator out_y(output_ptr->plane(0), win); - Iterator out_u(output_ptr->plane(1), win); - Iterator out_v(output_ptr->plane(2), win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb = load_rgb(in.ptr(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], - out_y.ptr(), out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); -} -} // namespace arm_compute diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h deleted file mode 100644 index 5758264b9a..0000000000 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_NEFIXEDPOINT_H -#define ARM_COMPUTE_NEFIXEDPOINT_H - -#include - -namespace arm_compute -{ -/** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements - * - * @param[in] a Float input vector - * @param[in] b Float input vector - * - * @return The lane-by-lane maximum -> float32x4x2 - */ -float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b); -} // namespace arm_compute -#include "arm_compute/core/NEON/NEFixedPoint.inl" -#endif /* ARM_COMPUTE_NEFIXEDPOINT_H */ \ No newline at end of file diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl deleted file mode 100644 index c2c2b25fef..0000000000 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include -#include - -namespace arm_compute -{ -#ifndef DOXYGEN_SKIP_THIS - -inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b) -{ - float32x4x2_t res = - { - { - vmaxq_f32(a.val[0], b.val[0]), - vmaxq_f32(a.val[1], b.val[1]) - } - }; - return res; -} -#endif /* DOXYGEN_SKIP_THIS */ -} // namespace arm_compute diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h deleted file mode 100644 index b82a9a341c..0000000000 --- a/arm_compute/core/NEON/NEMath.h +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEMATH_H -#define ARM_COMPUTE_NEMATH_H - -#include -#include - -namespace arm_compute -{ -/** Calculate floor of a vector. - * - * @param[in] val Input vector value in F32 format. - * - * @return The calculated floor vector. - */ -float32x4_t vfloorq_f32(float32x4_t val); - -/** Calculate round value of a vector to nearest with ties to even. - * - * @param[in] val Input vector value in F32 format. - * - * @return The calculated round vector. - */ -float32x4_t vroundq_rte_f32(float32x4_t val); - -/** Calculate inverse square root. - * - * @param[in] x Input value. - * - * @return The calculated inverse square root. - */ -float32x2_t vinvsqrt_f32(float32x2_t x); - -/** Calculate inverse square root. - * - * @param[in] x Input value. - * - * @return The calculated inverse square root. - */ -float32x4_t vinvsqrtq_f32(float32x4_t x); - -/** Calculate reciprocal. - * - * @param[in] x Input value. - * - * @return The calculated reciprocal. - */ -float32x2_t vinv_f32(float32x2_t x); - -/** Calculate reciprocal. - * - * @param[in] x Input value. - * - * @return The calculated reciprocal. - */ -float32x4_t vinvq_f32(float32x4_t x); - -/** Perform a 7th degree polynomial approximation using Estrin's method. - * - * @param[in] x Input vector value in F32 format. - * @param[in] coeffs Polynomial coefficients table. - * - * @return The calculated approximation. - */ -float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array &coeffs); - -/** Calculate exponential - * - * @param[in] x Input vector value in F32 format. - * - * @return The calculated exponent. - */ -float32x4_t vexpq_f32(float32x4_t x); - -/** Calculate logarithm - * - * @param[in] x Input vector value in F32 format. - * - * @return The calculated logarithm. - */ -float32x4_t vlogq_f32(float32x4_t x); - -/** Calculate hyperbolic tangent. - * - * tanh(x) = (e^2x - 1)/(e^2x + 1) - * - * @note We clamp x to [-5,5] to avoid overflowing issues. - * - * @param[in] val Input vector value in F32 format. - * - * @return The calculated Hyperbolic Tangent. - */ -float32x4_t vtanhq_f32(float32x4_t val); - -/** Calculate n power of a number. - * - * pow(x,n) = e^(n*log(x)) - * - * @param[in] val Input vector value in F32 format. - * @param[in] n Powers to raise the input to. - * - * @return The calculated power. 
- */ -float32x4_t vpowq_f32(float32x4_t val, float32x4_t n); - -/** Round to the nearest division by a power-of-two using exponent - * - * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent - * - * @param[in] x Vector of 4 elements - * @param[in] exponent Vector of 4 elements with integer value used to round to nearest division by a power-of-two - * - * @return the nearest division by a power-of-two using exponent - */ -int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent); - -/** Round to the nearest division by a power-of-two using exponent - * - * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent - * - * @param[in] x Vector of 4 elements - * @param[in] exponent Integer value used to round to nearest division by a power-of-two - * - * @return the nearest division by a power-of-two using exponent - */ -int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent); - -/** Round to the nearest division by a power-of-two using exponent - * - * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent - * - * @param[in] x Element to divide. - * @param[in] exponent Integer value used to round to nearest division by a power-of-two - * - * @return the nearest division by a power-of-two using exponent - */ -int32_t rounding_divide_by_pow2(int32_t x, int exponent); - -/** Converts from uint8x16 to float32x4x4_t - * - * @param[in] in Vector of uint8 to be converted - * - * @return Converted vector of float - */ -float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in); - -/** Converts from int8x16 to float32x4x4_t - * - * @param[in] in Vector of int8 to be converted - * - * @return Converted vector of float - */ -float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in); - -/** Converts to float32x4x4_t from the specified templated 16 elements vectors - * - * @param[in] in Vector of float to be converted - * - * @return Converted vector of float - */ -template -float32x4x4_t convert_to_float32x4x4(const T &in); - -/** Converts from two float32x4x3_t to just one uint8x8x3_t - * - * @param[in] in1 First input vector of float to be converted - * @param[in] in2 Second input vector of float to be converted - * @param[out] out Converted output vector uint8 to store the result - */ -void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out); - -/** Converts from two float32x4x4_t to just one uint8x16_t - * - * @param[in] in Vector of float to be converted - * @param[out] out Converted vector of uint8 to store the result - */ -void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out); - -/** Converts from float32x4x4_t to just one int8x16_t - * - * @param[in] in Vector of float to be converted - * @param[out] out Converted vector of uint8 to store the result - */ -void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out); - -/** Calculate sine. - * - * @param[in] val Input vector value in radians, F32 format. - * - * @return The calculated sine. - */ -float32x4_t vsinq_f32(float32x4_t val); - -/** Calculate sine. - * - * @param[in] val Input vector value in radians, F32 format. - * - * @return The calculated sine. - */ -float32x2_t vsin_f32(float32x2_t val); - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -/** Calculate hyperbolic tangent. - * - * tanh(x) = (e^2x - 1)/(e^2x + 1) - * - * @note We clamp x to [-5,5] to avoid overflowing issues. 
- * - * @param[in] val Input vector value in F16 format. - * - * @return The calculated Hyperbolic Tangent. - */ -float16x8_t vtanhq_f16(float16x8_t val); - -/** Calculate round value of a vector to nearest with ties to even. - * - * @param[in] val Input vector value in F16 format. - * - * @return The calculated round vector. - */ -float16x8_t vroundq_rte_f16(float16x8_t val); - -/** Calculate reciprocal. - * - * @param[in] x Input value. - * - * @return The calculated reciprocal. - */ -float16x4_t vinv_f16(float16x4_t x); - -/** Calculate reciprocal. - * - * @param[in] x Input value. - * - * @return The calculated reciprocal. - */ -float16x8_t vinvq_f16(float16x8_t x); - -/** Calculate inverse square root. - * - * @param[in] x Input value. - * - * @return The calculated inverse square root. - */ -float16x4_t vinvsqrt_f16(float16x4_t x); - -/** Calculate inverse square root. - * - * @param[in] x Input value. - * - * @return The calculated inverse square root. - */ -float16x8_t vinvsqrtq_f16(float16x8_t x); - -/** Calculate exponential - * - * @param[in] x Input vector value in F16 format. - * - * @return The calculated exponent. - */ -float16x8_t vexpq_f16(float16x8_t x); - -/** Calculate n power of a number. - * - * pow(x,n) = e^(n*log(x)) - * - * @param[in] val Input vector value in F16 format. - * @param[in] n Powers to raise the input to. - * - * @return The calculated power. - */ -float16x8_t vpowq_f16(float16x8_t val, float16x8_t n); - -/** Calculate sine. - * - * @param[in] val Input vector value in radians, F16 format. - * - * @return The calculated sine. - */ -float16x8_t vsinq_f16(float16x8_t val); - -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} // namespace arm_compute -#include "arm_compute/core/NEON/NEMath.inl" -#endif /* ARM_COMPUTE_NEMATH_H */ diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl deleted file mode 100644 index a1c3d41880..0000000000 --- a/arm_compute/core/NEON/NEMath.inl +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include -#include - -#ifndef M_PI -#define M_PI (3.14159265358979323846) -#endif // M_PI - -namespace arm_compute -{ -/** Exponent polynomial coefficients */ -const std::array exp_tab = -{ - { - vdupq_n_f32(1.f), - vdupq_n_f32(0.0416598916054f), - vdupq_n_f32(0.500000596046f), - vdupq_n_f32(0.0014122662833f), - vdupq_n_f32(1.00000011921f), - vdupq_n_f32(0.00833693705499f), - vdupq_n_f32(0.166665703058f), - vdupq_n_f32(0.000195780929062f), - } -}; - -/** Logarithm polynomial coefficients */ -const std::array log_tab = -{ - { - vdupq_n_f32(-2.29561495781f), - vdupq_n_f32(-2.47071170807f), - vdupq_n_f32(-5.68692588806f), - vdupq_n_f32(-0.165253549814f), - vdupq_n_f32(5.17591238022f), - vdupq_n_f32(0.844007015228f), - vdupq_n_f32(4.58445882797f), - vdupq_n_f32(0.0141278216615f), - } -}; - -/** Sin polynomial coefficients */ -constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3) -constexpr float te_sin_coeff3 = 0.05f; // 1/(4*5) -constexpr float te_sin_coeff4 = 0.023809523810f; // 1/(6*7) -constexpr float te_sin_coeff5 = 0.013888888889f; // 1/(8*9) - -#ifndef DOXYGEN_SKIP_THIS -inline float32x4_t vfloorq_f32(float32x4_t val) -{ - static const float32x4_t CONST_1 = vdupq_n_f32(1.f); - - const int32x4_t z = vcvtq_s32_f32(val); - const float32x4_t r = vcvtq_f32_s32(z); - - return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r); -} - -inline float32x4_t vroundq_rte_f32(float32x4_t val) -{ -#ifdef __aarch64__ - return vrndnq_f32(val); -#else // __aarch64__ - static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f); - static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f); - static const int32x4_t CONST_1_INT = vdupq_n_s32(1); - const float32x4_t floor_val = vfloorq_f32(val); - const float32x4_t diff = vsubq_f32(val, floor_val); - - /* - * Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0). 
- * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))) - */ - - return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))), - floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); -#endif // __aarch64__ -} - -inline float32x2_t vinvsqrt_f32(float32x2_t x) -{ - float32x2_t sqrt_reciprocal = vrsqrte_f32(x); - sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - - return sqrt_reciprocal; -} - -inline float32x4_t vinvsqrtq_f32(float32x4_t x) -{ - float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - - return sqrt_reciprocal; -} - -inline float32x2_t vinv_f32(float32x2_t x) -{ - float32x2_t recip = vrecpe_f32(x); - recip = vmul_f32(vrecps_f32(x, recip), recip); - recip = vmul_f32(vrecps_f32(x, recip), recip); - return recip; -} - -inline float32x4_t vinvq_f32(float32x4_t x) -{ - float32x4_t recip = vrecpeq_f32(x); - recip = vmulq_f32(vrecpsq_f32(x, recip), recip); - recip = vmulq_f32(vrecpsq_f32(x, recip), recip); - return recip; -} - -inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array &coeffs) -{ - float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x); - float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x); - float32x4_t C = vmlaq_f32(coeffs[1], coeffs[5], x); - float32x4_t D = vmlaq_f32(coeffs[3], coeffs[7], x); - float32x4_t x2 = vmulq_f32(x, x); - float32x4_t x4 = vmulq_f32(x2, x2); - float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4); - return res; -} - -inline float32x4_t vexpq_f32(float32x4_t x) -{ - static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) - static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2) - static const float32x4_t CONST_INF = vdupq_n_f32(std::numeric_limits::infinity()); - static const float32x4_t CONST_MAX_INPUT = vdupq_n_f32(88.7f); - static const float32x4_t CONST_0 = vdupq_n_f32(0.f); - static const int32x4_t CONST_NEGATIVE_126 = vdupq_n_s32(-126); - - // Perform range reduction [-log(2),log(2)] - int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2)); - float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2); - - // Polynomial Approximation - float32x4_t poly = vtaylor_polyq_f32(val, exp_tab); - - // Reconstruct - poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23))); - poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly); // Handle underflow - poly = vbslq_f32(vcgtq_f32(x, CONST_MAX_INPUT), CONST_INF, poly); // Handle overflow - - return poly; -} - -inline float32x4_t vlogq_f32(float32x4_t x) -{ - static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 - static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) - - // Extract exponent - int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); - float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); - - // Polynomial 
Approximation
-    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
-
-    // Reconstruct
-    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
-
-    return poly;
-}
-
-inline float32x4_t vtanhq_f32(float32x4_t val)
-{
-    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);
-    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);
-    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
-    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
-
-    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
-    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
-    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
-    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
-    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
-    return tanh;
-}
-
-inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
-{
-    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
-}
-
-inline float32x4_t vsinq_f32(float32x4_t val)
-{
-    const float32x4_t pi_v   = vdupq_n_f32(M_PI);
-    const float32x4_t pio2_v = vdupq_n_f32(M_PI / 2);
-    const float32x4_t ipi_v  = vdupq_n_f32(1 / M_PI);
-
-    //Find positive or negative
-    const int32x4_t  c_v    = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v)));
-    const uint32x4_t sign_v = vcleq_f32(val, vdupq_n_f32(0));
-    const uint32x4_t odd_v  = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1));
-
-    uint32x4_t neg_v = veorq_u32(odd_v, sign_v);
-
-    //Modulus a - (n * int(a*(1/n)))
-    float32x4_t      ma    = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v)));
-    const uint32x4_t reb_v = vcgeq_f32(ma, pio2_v);
-
-    //Rebase a between 0 and pi/2
-    ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma);
-
-    //Taylor series
-    const float32x4_t ma2 = vmulq_f32(ma, ma);
-
-    //2nd elem: x^3 / 3!
-    float32x4_t elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(te_sin_coeff2));
-    float32x4_t res  = vsubq_f32(ma, elem);
-
-    //3rd elem: x^5 / 5!
-    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff3));
-    res  = vaddq_f32(res, elem);
-
-    //4th elem: x^7 / 7!
-    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff4));
-    res  = vsubq_f32(res, elem);
-
-    //5th elem: x^9 / 9!
-    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff5));
-    res  = vaddq_f32(res, elem);
-
-    //Change of sign
-    neg_v = vshlq_n_u32(neg_v, 31);
-    res   = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v));
-    return res;
-}
-
-inline float32x2_t vsin_f32(float32x2_t val)
-{
-    const float32x2_t pi_v   = vdup_n_f32(M_PI);
-    const float32x2_t pio2_v = vdup_n_f32(M_PI / 2);
-    const float32x2_t ipi_v  = vdup_n_f32(1 / M_PI);
-
-    //Find positive or negative
-    const int32x2_t  c_v    = vabs_s32(vcvt_s32_f32(vmul_f32(val, ipi_v)));
-    const uint32x2_t sign_v = vcle_f32(val, vdup_n_f32(0));
-    const uint32x2_t odd_v  = vand_u32(vreinterpret_u32_s32(c_v), vdup_n_u32(1));
-
-    uint32x2_t neg_v = veor_u32(odd_v, sign_v);
-
-    //Modulus a - (n * int(a*(1/n)))
-    float32x2_t      ma    = vsub_f32(vabs_f32(val), vmul_f32(pi_v, vcvt_f32_s32(c_v)));
-    const uint32x2_t reb_v = vcge_f32(ma, pio2_v);
-
-    //Rebase a between 0 and pi/2
-    ma = vbsl_f32(reb_v, vsub_f32(pi_v, ma), ma);
-
-    //Taylor series
-    const float32x2_t ma2 = vmul_f32(ma, ma);
-
-    //2nd elem: x^3 / 3!
-    float32x2_t elem = vmul_f32(vmul_f32(ma, ma2), vdup_n_f32(te_sin_coeff2));
-    float32x2_t res  = vsub_f32(ma, elem);
-
-    //3rd elem: x^5 / 5!
-    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff3));
-    res  = vadd_f32(res, elem);
-
-    //4th elem: x^7 / 7!
-    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff4));
-    res  = vsub_f32(res, elem);
-
-    //5th elem: x^9 / 9!
-    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff5));
-    res  = vadd_f32(res, elem);
-
-    //Change of sign
-    neg_v = vshl_n_u32(neg_v, 31);
-    res   = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(res), neg_v));
-    return res;
-}
-
-#endif /* DOXYGEN_SKIP_THIS */
-
-inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent)
-{
-    const int32x4_t shift_vec  = vnegq_s32(exponent);
-    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
-    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
-    return vrshlq_s32(fixed_up_x, shift_vec);
-}
-
-inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
-{
-    const int32x4_t shift_vec  = vdupq_n_s32(-exponent);
-    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
-    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
-    return vrshlq_s32(fixed_up_x, shift_vec);
-}
-
-inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
-{
-    const int32_t mask      = (1 << exponent) - 1;
-    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
-    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
-}
-
-inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
-{
-    float32x4x4_t out;
-
-    const auto tmp1 = vmovl_u8(vget_low_u8(in));
-    out.val[0]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
-    out.val[1]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
-
-    const auto tmp2 = vmovl_u8(vget_high_u8(in));
-    out.val[2]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
-    out.val[3]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
-    return out;
-}
-
-inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in)
-{
-    float32x4x4_t out;
-
-    const auto tmp1 = vmovl_s8(vget_low_s8(in));
-    out.val[0]      = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1)));
-    out.val[1]      = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1)));
-
-    const auto tmp2 = vmovl_s8(vget_high_s8(in));
-    out.val[2]      = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2)));
-    out.val[3]      = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2)));
-    return out;
-}
-
-template <>
-inline float32x4x4_t convert_to_float32x4x4(const uint8x16_t &in)
-{
-    return convert_uint8x16_to_float32x4x4(in);
-}
-
-template <>
-inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in)
-{
-    return convert_int8x16_to_float32x4x4(in);
-}
-
-inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
-{
-    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
-    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
-    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
-}
-
-inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
-{
-    const auto low  = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
-                                   vqmovn_u32(vcvtq_u32_f32(in.val[1])));
-    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
-                                   vqmovn_u32(vcvtq_u32_f32(in.val[3])));
-    out             = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
-}
-
-inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
-{
-    const auto low =
vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), - vqmovn_s32(vcvtq_s32_f32(in.val[1]))); - const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), - vqmovn_s32(vcvtq_s32_f32(in.val[3]))); - out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -/** Exponent polynomial coefficients */ -/** Logarithm polynomial coefficients */ -#ifndef DOXYGEN_SKIP_THIS -inline float16x8_t vfloorq_f16(float16x8_t val) -{ - static const float16x8_t CONST_1 = vdupq_n_f16(1.f); - - const int16x8_t z = vcvtq_s16_f16(val); - const float16x8_t r = vcvtq_f16_s16(z); - - return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, CONST_1), r); -} - -inline float16x8_t vroundq_rte_f16(float16x8_t val) -{ - return vrndnq_f16(val); -} - -inline float16x4_t vinvsqrt_f16(float16x4_t x) -{ - float16x4_t sqrt_reciprocal = vrsqrte_f16(x); - sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - return sqrt_reciprocal; -} - -inline float16x8_t vinvsqrtq_f16(float16x8_t x) -{ - float16x8_t sqrt_reciprocal = vrsqrteq_f16(x); - sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - return sqrt_reciprocal; -} - -inline float16x4_t vinv_f16(float16x4_t x) -{ - float16x4_t recip = vrecpe_f16(x); - recip = vmul_f16(vrecps_f16(x, recip), recip); - recip = vmul_f16(vrecps_f16(x, recip), recip); - return recip; -} - -inline float16x8_t vinvq_f16(float16x8_t x) -{ - float16x8_t recip = vrecpeq_f16(x); - recip = vmulq_f16(vrecpsq_f16(x, recip), recip); - recip = vmulq_f16(vrecpsq_f16(x, recip), recip); - return recip; -} - -inline float16x8_t vtanhq_f16(float16x8_t val) -{ - const float16x8_t CONST_1 = vdupq_n_f16(1.f); - const float16x8_t CONST_2 = vdupq_n_f16(2.f); - const float16x8_t CONST_MIN_TANH = vdupq_n_f16(-10.f); - const float16x8_t CONST_MAX_TANH = vdupq_n_f16(10.f); - - const float16x8_t x = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH); - const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x)); - const float16x8_t num = vsubq_f16(exp2x, CONST_1); - const float16x8_t den = vaddq_f16(exp2x, CONST_1); - const float16x8_t tanh = vmulq_f16(num, vinvq_f16(den)); - return tanh; -} - -inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs) -{ - const float16x8_t A = vaddq_f16(coeffs[0], vmulq_f16(coeffs[4], x)); - const float16x8_t B = vaddq_f16(coeffs[2], vmulq_f16(coeffs[6], x)); - const float16x8_t C = vaddq_f16(coeffs[1], vmulq_f16(coeffs[5], x)); - const float16x8_t D = vaddq_f16(coeffs[3], vmulq_f16(coeffs[7], x)); - const float16x8_t x2 = vmulq_f16(x, x); - const float16x8_t x4 = vmulq_f16(x2, x2); - const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4)); - return res; -} - -inline float16x8_t vexpq_f16(float16x8_t x) -{ - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); - const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); - - const float16x8_t res = vcombine_f16(vcvt_f16_f32(vexpq_f32(x_low)), vcvt_f16_f32(vexpq_f32(x_high))); - return res; -} - -inline float16x8_t vlogq_f16(float16x8_t x) -{ - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float32x4_t x_high =
vcvt_f32_f16(vget_high_f16(x)); - const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); - - const float16x8_t res = vcombine_f16(vcvt_f16_f32(vlogq_f32(x_low)), vcvt_f16_f32(vlogq_f32(x_high))); - return res; -} - -inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n) -{ - // TODO (giaiod01) - COMPMID-1535 - float32x4_t n0_f32 = vcvt_f32_f16(vget_low_f16(n)); - float32x4_t n1_f32 = vcvt_f32_f16(vget_high_f16(n)); - float32x4_t val0_f32 = vcvt_f32_f16(vget_low_f16(val)); - float32x4_t val1_f32 = vcvt_f32_f16(vget_high_f16(val)); - - float32x4_t res0_f32 = vexpq_f32(vmulq_f32(n0_f32, vlogq_f32(val0_f32))); - float32x4_t res1_f32 = vexpq_f32(vmulq_f32(n1_f32, vlogq_f32(val1_f32))); - - return vcombine_f16(vcvt_f16_f32(res0_f32), vcvt_f16_f32(res1_f32)); -} - -inline float16x8_t vsinq_f16(float16x8_t val) -{ - const float32x4_t val_high = vcvt_f32_f16(vget_high_f16(val)); - const float32x4_t val_low = vcvt_f32_f16(vget_low_f16(val)); - - const float32x4_t res_high = vsinq_f32(val_high); - const float32x4_t res_low = vsinq_f32(val_low); - - return vcombine_f16(vcvt_f16_f32(res_low), vcvt_f16_f32(res_high)); -} - -inline float16x4_t vsin_f16(float16x4_t val) -{ - const float32x4_t val_f32 = vcvt_f32_f16(val); - const float32x2_t val_high = vget_high_f32(val_f32); - const float32x2_t val_low = vget_low_f32(val_f32); - - const float32x2_t res_high = vsin_f32(val_high); - const float32x2_t res_low = vsin_f32(val_low); - - return vcvt_f16_f32(vcombine_f32(res_low, res_high)); -} - -#endif /* DOXYGEN_SKIP_THIS */ -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} // namespace arm_compute diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h deleted file mode 100644 index 6dee8705f4..0000000000 --- a/arm_compute/core/NEON/NESymm.h +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_NESYMM_H -#define ARM_COMPUTE_NESYMM_H - -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include <arm_neon.h> - -namespace arm_compute -{ -using qsymm8_t = int8_t; /**< 8 bit quantized symmetric scalar value */ -using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */ - -using qsymm16x8_t = int16x8_t; /**< 16 bit quantized symmetric vector with 8 elements */ -using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 16 elements */ - -/** Performs final quantization step on 8 signed 16-bit elements - * - * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied - * - * @param[in] in_s32 Input to be quantized. - * @param[in] result_fixedpoint_multiplier Result multiplier parameter - * @param[in] result_shift Result shift parameter - * @param[in] min_s16 Relu lower bound - * @param[in] max_s16 Relu upper bound - * - * @return Quantized values - */ -template <bool is_bounded_relu> -int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int16x8_t min_s16, - int16x8_t max_s16) -{ - if(result_shift < 0) - { - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift)); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift)); - - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - } - else - { - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); - in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); - // Round to the nearest division by a power-of-two using result_shift_s32 - in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); - in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift); - } - - // Convert S32 to S16 - int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])); - - if(is_bounded_relu) - { - out_s16 = vmaxq_s16(out_s16, min_s16); - out_s16 = vminq_s16(out_s16, max_s16); - } - - return out_s16; -} - -/** Performs final quantization step on single signed 16-bit element - * - * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied - * - * @param[in] in_value Input to be quantized.
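A minimal usage sketch for the vector overload above; the function name and the chosen scale are illustrative. An effective requantization scale of 0.25 can be encoded as result_fixedpoint_multiplier = 1 << 30 (0.5 in Q31) together with result_shift = 1.

#include <arm_neon.h>

int16x8_t requantize_by_0p25(int32x4x2_t acc)
{
    const int16x8_t min_s16 = vdupq_n_s16(-32768);
    const int16x8_t max_s16 = vdupq_n_s16(32767);
    // is_bounded_relu = false, so the bounds are ignored here
    return finalize_quantization_int16<false>(acc, 1 << 30, 1, min_s16, max_s16);
}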
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter - * @param[in] result_shift Result shift parameter - * @param[in] min_s16 Relu lower bound - * @param[in] max_s16 Relu upper bound - * - * @return Quantized values - */ -template <bool is_bounded_relu> -inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int16_t min_s16, int16_t max_s16) -{ - if(result_shift < 0) - { - const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier); - in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31); - } - else - { - // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - const int64_t in_64 = static_cast<int64_t>(in_value) * static_cast<int64_t>(result_fixedpoint_multiplier); - // Shift value by result_shift_s32 - in_value = rounding_divide_by_pow2(static_cast<int32_t>((in_64 + (1 << 30)) >> 31), result_shift); - } - - // Bound the result - int16_t out_s16 = static_cast<int16_t>(std::max(-32768, std::min(32767, in_value))); - - if(is_bounded_relu) - { - out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16))); - } - - return out_s16; -} - -/** Dequantize a neon vector holding 8 16-bit quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] scale Quantization scale - * - * @return Dequantized values in a neon vector - */ -inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale) -{ - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale) - } - }; - return vdequantized_input; -} - -/** Quantize a neon vector holding 8 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] scale Quantization scale - * - * @return A neon vector holding the quantized values - */ -inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale) -{ - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - - const int32x4x2_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) -#endif //__aarch64__ - } - }; - return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); -} - -/** Dequantize a neon vector holding 16 16-bit quantized values. - * - * @param[in] qv Input values to be dequantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return Dequantized values in a neon vector - */ -inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), - } - }; - return vdequantized_input; -} - -/** Quantize a neon vector holding 16 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] qi Quantization information to be used in the computation.
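A worked numeric check for the scalar overload above, under the same illustrative encoding (multiplier = 1 << 30, i.e. 0.5 in Q31, with result_shift = 1, for an effective scale of 0.25):

#include <cassert>

void finalize_quantization_int16_example()
{
    // in_64 = 1000 * (1 << 30) = 1073741824000
    // high  = (in_64 + (1 << 30)) >> 31 = 500   (~ 1000 * 0.5)
    // rounding_divide_by_pow2(500, 1) = 250     (~ 1000 * 0.25)
    assert((finalize_quantization_int16<false>(1000, 1 << 30, 1, -32768, 32767) == 250));
}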
- * - * @return A neon vector holding the quantized values - */ -inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - const float scale = qi.scale; - ARM_COMPUTE_ERROR_ON(scale == 0.f); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), -#endif //__aarch64__ - } - }; - const qsymm16x8x2_t res = - { - vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])), - vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])), - }; - - return res; -} - -/** Multiply a neon vector using quantized multiplier and shift - * - * @param[in] input Input vector to mutiply values to be quantized. - * @param[in] qmul Quantized multipler - * @param[in] shift Left bit shift - * - * @return A neon vector holding the multiplied value - */ -inline int32x4x2_t multiply_by_quantized_multiplier_2row(int32x4x2_t input, int32_t qmul, int32_t shift) -{ - const auto left_shift = shift > 0 ? shift : 0; - const auto right_shift = shift > 0 ? 0 : -shift; - const auto one_shifted = 1 << left_shift; - - int32x4x2_t result; - result.val[0] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[0], one_shifted), qmul), right_shift); - result.val[1] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[1], one_shifted), qmul), right_shift); - - return result; -} - -} // namespace arm_compute -#endif // ARM_COMPUTE_NESYMM_H diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h deleted file mode 100644 index 067a18cb62..0000000000 --- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
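The left/right split of the shift argument in multiply_by_quantized_multiplier_2row above follows the usual gemmlowp-style convention; a scalar model of one lane, under the assumption that the left shift does not overflow (the vector code makes the same assumption):

#include <cstdint>

int32_t multiply_by_quantized_multiplier(int32_t v, int32_t qmul, int32_t shift)
{
    const int32_t left_shift  = shift > 0 ? shift : 0;
    const int32_t right_shift = shift > 0 ? 0 : -shift;

    // (v * 2^left_shift) * qmul / 2^31, rounded: the scalar analogue of
    // vqrdmulhq_n_s32 (saturation omitted for brevity)
    const int64_t prod = static_cast<int64_t>(v * (1 << left_shift)) * qmul;
    const int32_t high = static_cast<int32_t>((prod + (1ll << 30)) >> 31);

    return rounding_divide_by_pow2(high, right_shift); // from NEMath.inl above
}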
- */ -#ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H -#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H - -#include "arm_compute/core/NEON/wrapper/wrapper.h" - -namespace arm_compute -{ -namespace detail -{ -/** Dummy activation object */ -template <typename T, int S> -struct dummy -{ - /** NEON vector type. */ - using ExactType = typename wrapper::traits::neon_vector<T, S>::type; - - /** Construct a dummy activation object. - * - * @param[in] act_info Activation layer information. - */ - explicit dummy(ActivationLayerInfo act_info) - { - ARM_COMPUTE_UNUSED(act_info); - } - - /** Run activation function. - * - * @param[in] vval Vector of values. - */ - void operator()(ExactType &vval) - { - ARM_COMPUTE_UNUSED(vval); - } - - /** Run activation function. - * - * @param[in] val Scalar value. - */ - void operator()(T &val) - { - ARM_COMPUTE_UNUSED(val); - } -}; -/** Linear activation object */ -template <typename T, int S> -struct linear -{ - /** NEON vector type. */ - using ExactType = typename wrapper::traits::neon_vector<T, S>::type; - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - /** Construct a Linear activation object. - * - * @param[in] act_info Activation layer information. - */ - explicit linear(ActivationLayerInfo act_info) - : alpha(act_info.a()), - beta(act_info.b()), - valpha(wrapper::vdup_n(static_cast<T>(alpha), ExactTagType{})), - vbeta(wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})) - { - } - - /** Run activation function. - * - * @param[in] vval Vector of values. - */ - void operator()(ExactType &vval) - { - vval = wrapper::vmla(vbeta, vval, valpha); - } - - /** Run activation function. - * - * @param[in] val Scalar value. - */ - void operator()(T &val) - { - val = alpha * val + beta; - } - - const T alpha; /**< Scalar alpha */ - const T beta; /**< Scalar beta */ - const ExactType valpha; /**< Vector of alphas. */ - const ExactType vbeta; /**< Vector of betas. */ -}; -/** Square activation object */ -template <typename T, int S> -struct square -{ - /** NEON vector type. */ - using ExactType = typename wrapper::traits::neon_vector<T, S>::type; - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - /** Construct a Square activation object. - * - * @param[in] act_info Activation layer information. - */ - explicit square(ActivationLayerInfo act_info) - { - ARM_COMPUTE_UNUSED(act_info); - } - - /** Run activation function. - * - * @param[in] vval Vector of values. - */ - void operator()(ExactType &vval) - { - vval = wrapper::vmul(vval, vval); - } - - /** Run activation function. - * - * @param[in] val Scalar value. - */ - void operator()(T &val) - { - val = val * val; - } -}; -/** Logistic activation object */ -template <typename T, int S> -struct logistic -{ - /** NEON vector type. */ - using ExactType = typename wrapper::traits::neon_vector<T, S>::type; - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - /** Construct a Logistic activation object. - * - * @param[in] act_info Activation layer information. - */ - explicit logistic(ActivationLayerInfo act_info) - : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{})) - { - ARM_COMPUTE_UNUSED(act_info); - } - - /** Run activation function. - * - * @param[in] vval Vector of values. - */ - void operator()(ExactType &vval) - { - vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval)))); - } - - /** Run activation function. - * - * @param[in] val Scalar value.
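A hypothetical caller of the functor pattern above, to show how these objects are meant to be used: the functor is constructed once per kernel from the ActivationLayerInfo, then applied per loaded vector, with the scalar overload handling the tail. The function name and include path are illustrative.

#include <arm_neon.h>
#include "arm_compute/core/Types.h" // ActivationLayerInfo (assumed include)

void apply_linear_f32(const float *src, float *dst, int n,
                      const arm_compute::ActivationLayerInfo &info)
{
    arm_compute::detail::linear<float, 4> act(info); // y = a*x + b on float32x4_t
    int i = 0;
    for(; i <= n - 4; i += 4)
    {
        float32x4_t v = vld1q_f32(src + i);
        act(v); // v = vbeta + v * valpha, via wrapper::vmla
        vst1q_f32(dst + i, v);
    }
    for(; i < n; ++i)
    {
        float x = src[i];
        act(x); // scalar path: x = alpha * x + beta
        dst[i] = x;
    }
}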
- */ - void operator()(T &val) - { - val = 1 / (1 + std::exp(-val)); - } - - /** Vector of ones. */ - const ExactType vone; -}; -/** RELU activation object */ -template <typename T, int S> -struct relu -{ - /** NEON vector type. */ - using ExactType = typename wrapper::traits::neon_vector<T, S>::type; - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - /** Construct a RELU activation object. - * - * @param[in] act_info Activation layer information. - */ - explicit relu(ActivationLayerInfo act_info) - : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})) - { - ARM_COMPUTE_UNUSED(act_info); - } - - /** Run activation function. - * - * @param[in] vval Vector of values. - */ - void operator()(ExactType &vval) - { - vval = wrapper::vmax(vzero, vval); - } - - /** Run activation function. - * - * @param[in] val Scalar value. - */ - void operator()(T &val) - { - val = std::max(static_cast<T>(0), val); - } - - /** Vector of zeroes. */ - const ExactType vzero; -}; -/** Bounded RELU activation object */ -template <typename T, int S> -struct brelu -{ - /** NEON vector type. */ - using ExactType = typename wrapper::traits::neon_vector<T, S>::type; - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - /** Construct a bounded RELU activation object. - * - * @param[in] act_info Activation layer information. - */ - explicit brelu(ActivationLayerInfo act_info) - : alpha(act_info.a()), - vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})), - valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})) - { - } - - /** Run activation function. - * - * @param[in] vval Vector of values. - */ - void operator()(ExactType &vval) - { - vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval)); - } - - /** Run activation function. - * - * @param[in] val Scalar value. - */ - void operator()(T &val) - { - val = std::min(alpha, std::max(static_cast<T>(0), val)); - } - - const T alpha; /**< Scalar alpha */ - const ExactType vzero; /**< Vector of zeroes. */ - const ExactType valpha; /**< Vector of alphas. */ -}; -/** Lower-Upper Bounded RELU activation object */ -template <typename T, int S> -struct lubrelu -{ - /** NEON vector type. */ - using ExactType = typename wrapper::traits::neon_vector<T, S>::type; - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - /** Construct a lower-upper bounded RELU activation object. - * - * @param[in] act_info Activation layer information. - */ - explicit lubrelu(ActivationLayerInfo act_info) - : alpha(act_info.a()), - beta(act_info.b()), - valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})), - vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{})) - { - } - - /** Run activation function. - * - * @param[in] vval Vector of values. - */ - void operator()(ExactType &vval) - { - vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval)); - } - - /** Run activation function. - * - * @param[in] val Scalar value. - */ - void operator()(T &val) - { - val = std::min(alpha, std::max(beta, val)); - } - - const T alpha; /**< Scalar alpha */ - const T beta; /**< Scalar beta */ - const ExactType valpha; /**< Vector of alphas. */ - const ExactType vbeta; /**< Vector of betas.
*/ -}; -} // namespace detail -} // namespace arm_compute -#endif /* ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H */ diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h deleted file mode 100644 index 41ad8fc706..0000000000 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H -#define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H - -#include - -namespace arm_compute -{ -namespace detail -{ -inline float32x4x3_t load_matrix_row(const float *ptr) -{ - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; - return r; -} - -template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); - -template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) -{ - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - float32x4x2_t out = - { - { - vmulq_f32(vtop.val[0], m0.val[0]), - vmulq_f32(vtop.val[1], m0.val[0]) - } - }; - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); - - out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); - - out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); - - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); - out.val[1] = 
vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); - - out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); - - out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); - return out; -} - -template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) -{ - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); - return out; -} - -template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) -{ - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); - return out; -} - -template -void store_results(float *buffer, const float32x4x2_t &values); - -template <> -void store_results<1>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, values.val[0]); - vst1q_f32(buffer + 4, values.val[1]); -} - -template <> -void store_results<2>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, values.val[0]); -} - -template <> -void store_results<3>(float *buffer, const float32x4x2_t &values) -{ - vst1_f32(buffer, vget_low_f32(values.val[0])); -} - -template -int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration); - -template <> -int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration) -{ - return num_elems_written_per_iteration; -} - -template <> -int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration) -{ - return num_elems_written_per_iteration << 1; -} - -template <> -int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration) -{ - return num_elems_written_per_iteration * 3; -} -} -} // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ \ No newline at end of file diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h deleted file mode 100644 index 78f08fdca6..0000000000 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ /dev/null @@ -1,965 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. 
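A scalar model of the lane shuffling done by convolve_3x3<2> above: the stride-1 kernel computes eight consecutive outputs o[0..7], and for stridex == 2 only every second one survives, packed to the front of the first vector (store_results<2> then writes four values; the stride-3 specialization keeps o[0] and o[3] and writes two). Illustrative helper only:

inline void compress_stride2(const float o[8], float out[4])
{
    out[0] = o[0];
    out[1] = o[2];
    out[2] = o[4];
    out[3] = o[6];
}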
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H -#define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/utils/misc/Requires.h" - -#include - -namespace arm_compute -{ -namespace detail -{ -/** Loads a 3x3 matrix as a row (float). - * - * @param[in] ptr Pointer to a float 3x3 matrix. - * @param[in] weights_offset (Optional) Weights quantization offset. - * - * @return The loaded matrix. - */ -inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) -{ - ARM_COMPUTE_UNUSED(weights_offset); - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; - return r; -} - -/** Loads a 3x3 matrix as a row (uint8_t/int8_t). - * - * @param[in] ptr Pointer to a uint8_t/int8_t 3x3 matrix. - * @param[in] weights_offset (Optional) Weights quantization offset. - * - * @return The loaded matrix. - */ -template < typename T, REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0) -{ - const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); - - /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: - r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - int32x4x3_t r = - { - { - vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) - } - }; - return r; -} - -/** Stores a float32x4x2_t array into a memory location. - * - * @param[in] buffer Pointer to the memory location where the values will be stored. - * @param[in] values Values that will be stored. 
- * - */ -template -void store_results(float *buffer, const float32x4x2_t &values); - -template <> -inline void store_results<1>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, values.val[0]); - vst1q_f32(buffer + 4, values.val[1]); -} - -template <> -inline void store_results<2>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, values.val[0]); -} - -template <> -inline void store_results<3>(float *buffer, const float32x4x2_t &values) -{ - vst1_f32(buffer, vget_low_f32(values.val[0])); -} - -/** Stores a uint32_t array into a memory location. - * - * @param[in] buffer Pointer to the memory location where the values will be stored. - * @param[in] values Values that will be stored. - * - */ -template -void store_results(int32_t *buffer, const int32x4x2_t &values); - -template <> -inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values) -{ - vst1q_s32(buffer, values.val[0]); - vst1q_s32(buffer + 4, values.val[1]); -} - -template <> -inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values) -{ - vst1q_s32(buffer, values.val[0]); -} - -template <> -inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values) -{ - vst1_s32(buffer, vget_low_s32(values.val[0])); -} - -template -inline void accumulate_results(float *buffer, const float32x4x2_t &values); - -template <> -inline void accumulate_results<1>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); - vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1])); -} - -template <> -inline void accumulate_results<2>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); -} - -template <> -inline void accumulate_results<3>(float *buffer, const float32x4x2_t &values) -{ - vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0]))); -} - -template -void accumulate_results(int32_t *buffer, const int32x4x2_t &values); - -template <> -inline void accumulate_results<1>(int32_t *buffer, const int32x4x2_t &values) -{ - vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0])); - vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1])); -} - -template <> -inline void accumulate_results<2>(int32_t *buffer, const int32x4x2_t &values) -{ - vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0])); -} - -template <> -inline void accumulate_results<3>(int32_t *buffer, const int32x4x2_t &values) -{ - vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0]))); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -/** Stores a float16x8x2_t array into a memory location. - * - * @param[in] buffer Pointer to the memory location where the values will be stored. - * @param[in] values Values that will be stored. 
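A hypothetical dispatch between the two families above, to show why both exist: a direct-convolution kernel typically stores on the first input channel and accumulates on the remaining ones. The helper name is illustrative.

#include <arm_neon.h>
#include <cstdint>

inline void write_row(int32_t *out, const int32x4x2_t &vals, bool first_channel)
{
    if(first_channel)
    {
        store_results<1>(out, vals);      // out  = vals
    }
    else
    {
        accumulate_results<1>(out, vals); // out += vals
    }
}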
- * - */ -template -void store_results(float16_t *buffer, const float16x8x2_t &values); - -template <> -inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, values.val[0]); - vst1q_f16(buffer + 8, values.val[1]); -} - -template <> -inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, values.val[0]); -} - -template <> -inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1_f16(buffer, vget_low_f16(values.val[0])); -} - -template -inline void accumulate_results(float16_t *buffer, const float16x8x2_t &values); - -template <> -inline void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); - vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1])); -} - -template <> -inline void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); -} - -template <> -inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0]))); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] dilation_x Dilation, in elements across x. - * @param[in] input_offset (Optional) Input quantization offset. - * - */ -inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + dilation_x), - vld1q_f32(in_top + 2 * dilation_x) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + dilation_x), - vld1q_f32(in_mid + 2 * dilation_x) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + dilation_x), - vld1q_f32(in_low + 2 * dilation_x) - } - }; - float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]); - out = vmlaq_f32(out, vtop.val[1], m0.val[1]); - out = vmlaq_f32(out, vtop.val[2], m0.val[2]); - - out = vmlaq_f32(out, vmid.val[0], m1.val[0]); - out = vmlaq_f32(out, vmid.val[1], m1.val[1]); - out = vmlaq_f32(out, vmid.val[2], m1.val[2]); - - out = vmlaq_f32(out, vlow.val[0], m2.val[0]); - out = vmlaq_f32(out, vlow.val[1], m2.val[1]); - out = vmlaq_f32(out, vlow.val[2], m2.val[2]); - - return out; -} - -/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] dilation_x Dilation, in elements across x. - * @param[in] stridex Stride value in elements across x. 
- * @param[in] input_offset (Optional) Input quantization offset. - * - */ -inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) -{ - ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; - - if(stridex == 2) - { - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); - } - else if(stridex == 3) - { - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); - } - - return out; -} - -/** Perform a convolve3x3 on float32. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[out] out_ptr Pointer to the output. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] stridex Stride value in elements across x. - * @param[in] input_offset (Optional) Input quantization offset. - * - */ -template -void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset = 0); - -template -inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - ARM_COMPUTE_ERROR_ON(stridex > 3); - - float32x4x2_t out = - { - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f) - } - }; - if(stridex == 2) - { - const float32x4x2_t vtop = vld2q_f32(in_top); - const float32x4x2_t vmid = vld2q_f32(in_mid); - const float32x4x2_t vlow = vld2q_f32(in_low); - const float32x4_t vtop_end = vld1q_f32(in_top + 8); - const float32x4_t vmid_end = vld1q_f32(in_mid + 8); - const float32x4_t vlow_end = vld1q_f32(in_low + 8); - - out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); - - out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]); - - out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]); - - out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]); - - accumulate ? 
accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); - } - else - { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); - - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); - - out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); - - out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); - - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); - - out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); - - out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); - - if(stridex == 3) - { - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); - accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); - } - else - { - accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); - } - } -} - -/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] dilation_x Dilation, in elements across x. - * @param[in] input_offset Input quantization offset. 
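A scalar reference for one output of single_convolve_3x3_dilation above: within a row the three taps sit dilation_x elements apart, while the three row pointers are already spaced by the y dilation by the caller. Illustrative sketch only:

#include <cstddef>

inline float conv3x3_dilated_at(const float *top, const float *mid, const float *low,
                                const float w[3][3], size_t dilation_x)
{
    float acc = 0.f;
    for(size_t k = 0; k < 3; ++k)
    {
        acc += top[k * dilation_x] * w[0][k];
        acc += mid[k * dilation_x] * w[1][k];
        acc += low[k * dilation_x] * w[2][k];
    }
    return acc;
}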
- * - */ -template < typename T, REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - size_t dilation_x, int32_t input_offset) -{ - using VectorType = typename std::conditional::value, uint8x8x3_t, int8x8x3_t>::type; - using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + dilation_x), - wrapper::vload(in_top + 2 * dilation_x) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + dilation_x), - wrapper::vload(in_mid + 2 * dilation_x) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + dilation_x), - wrapper::vload(in_low + 2 * dilation_x) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), - } - }; - - int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]); - out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]); - out = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]); - - out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]); - out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]); - out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]); - - out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]); - out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]); - out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]); - - return out; -} - -/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] dilation_x Dilation, in elements across x. - * @param[in] stridex Stride value in elements across x. - * @param[in] input_offset Input quantization offset. 
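A scalar model of the widening done above for asymmetric 8-bit inputs: each byte is widened to int32 and the quantization offset is folded in before the multiply-accumulate, mirroring wrapper::vaddw(v_input_offset, widened_input). Callers typically pass the negated zero point, so this recovers (in - zero_point) * weight. Illustrative only:

#include <cstdint>

inline int32_t offset_tap(uint8_t in, int32_t input_offset, int32_t weight)
{
    return (static_cast<int32_t>(in) + input_offset) * weight;
}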
- * - */ -template < typename T, REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset) -{ - ARM_COMPUTE_ERROR_ON(stridex > 3); - int32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; - - if(stridex == 2) - { - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); - } - else if(stridex == 3) - { - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); - } - return out; -} - -/** Perform a convolve3x3 on 8-bit elements - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[out] out_ptr Pointer to the output. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] stridex Stride value in elements across x. - * @param[in] input_offset Input quantization offset. - * - */ -template < bool accumulate, typename T1, typename T2, REQUIRES_TA(std::is_same::value || std::is_same::value) > -void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - unsigned int stridex, int32_t input_offset) -{ - ARM_COMPUTE_ERROR_ON(stridex > 3); - using VectorType = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; - using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + 8) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + 8) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + 8) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - } - }; - - int32x4x2_t out - { - { - 
wrapper::vdup_n(static_cast(0), OutputTagType{}), - wrapper::vdup_n(static_cast(0), OutputTagType{}), - } - }; - - // 0 - out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]); - out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]); - out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]); - - out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]); - out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]); - out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]); - - out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]); - out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]); - out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]); - - // 1 - out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]); - out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]); - out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]); - - out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]); - out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]); - out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]); - - out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]); - out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]); - out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]); - - if(stridex == 1) - { - accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); - } - else if(stridex == 2) - { - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); - - accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); - } - else if(stridex == 3) - { - out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); - accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -/** Loads a 3x3 matrix as a row (float16_t). - * - * @param[in] ptr Pointer to a float 3x3 matrix. - * - * @return The loaded matrix. - */ -inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = 0) -{ - ARM_COMPUTE_UNUSED(weights_offset); - /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: - r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const float16x8x3_t r = - { - { - vld1q_dup_f16(ptr), - vld1q_dup_f16(1 + ptr), - vld1q_dup_f16(2 + ptr) - } - }; - return r; -} - -/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. 
- * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] dilation_x Dilation, in elements across x. - * @param[in] input_offset (Optional)Input quantization offset. - * - */ -inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, int input_offset = 0) -{ - ARM_COMPUTE_UNUSED(input_offset); - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + dilation_x), - vld1q_f16(in_top + 2 * dilation_x) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + dilation_x), - vld1q_f16(in_mid + 2 * dilation_x) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + dilation_x), - vld1q_f16(in_low + 2 * dilation_x) - } - }; - float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]); - out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1])); - out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2])); - - out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0])); - out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1])); - out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2])); - - out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0])); - out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1])); - out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2])); - - return out; -} - -/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] dilation_x Dilation, in elements across x. - * @param[in] stridex Stride value in elements across x. - * @param[in] input_offset (Optional) Input quantization offset. - * - */ -inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) -{ - float16x8x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset) - } - }; - - if(stridex == 2) - { - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); - } - else if(stridex == 3) - { - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3); - } - - return out; -} - -/** Perform a convolve3x3 on float16. 
- * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[out] out_ptr Pointer to the output. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] stridex Stride value in elements across x. - * @param[in] input_offset (Optional) Input quantization offset. - * - */ -template <bool accumulate> -inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - unsigned int stridex, int input_offset = 0) -{ - ARM_COMPUTE_UNUSED(input_offset); - - float16x8x2_t out = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - if(stridex == 2) - { - const float16x8x2_t vtop = vld2q_f16(in_top); - const float16x8x2_t vmid = vld2q_f16(in_mid); - const float16x8x2_t vlow = vld2q_f16(in_low); - const float16x8_t vtop_end = vld1q_f16(in_top + 16); - const float16x8_t vmid_end = vld1q_f16(in_mid + 16); - const float16x8_t vlow_end = vld1q_f16(in_low + 16); - - out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); - - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vtop.val[1], m0.val[1])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop_end, 1), m0.val[2])); - - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[1], m1.val[1])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid_end, 1), m1.val[2])); - - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[1], m2.val[1])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow_end, 1), m2.val[2])); - - accumulate ?
accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); - } - else - { - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + 8), - vld1q_f16(in_top + 16) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + 8), - vld1q_f16(in_mid + 16) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + 8), - vld1q_f16(in_low + 16) - } - }; - out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); - - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1])); - out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1])); - out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2])); - - if(stridex == 3) - { - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3); - - accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); - } - else - { - accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); - } - } -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -/** Get the number of elements processed in a 3x3 convolution. - * - * @param[in] num_elems_written_per_iteration Number of elements written per iteration in a 3x3 convolution. - * @param[in] stridex Stride value in elements across x. - * - * @return The number of elements processed.
- */ -inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex) -{ - switch(stridex) - { - case 1: - return num_elems_written_per_iteration; - case 2: - return num_elems_written_per_iteration << 1; - case 3: - return num_elems_written_per_iteration * 3; - default: - ARM_COMPUTE_ERROR("stridex not supported"); - return 0; - } -} -} -} // namespace arm_compute -#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/abs.h b/arm_compute/core/NEON/wrapper/intrinsics/abs.h deleted file mode 100644 index 6927fa64a5..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/abs.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_ABS_H -#define ARM_COMPUTE_WRAPPER_ABS_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VABS_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vabs(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -#define VQABS_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vqabs(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -// Absolute: vabs{q}_<type>. Vd[i] = |Va[i]| -VABS_IMPL(int8x8_t, int8x8_t, vabs, s8) -VABS_IMPL(int16x4_t, int16x4_t, vabs, s16) -VABS_IMPL(int32x2_t, int32x2_t, vabs, s32) -VABS_IMPL(float32x2_t, float32x2_t, vabs, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VABS_IMPL(float16x4_t, float16x4_t, vabs, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VABS_IMPL(int8x16_t, int8x16_t, vabsq, s8) -VABS_IMPL(int16x8_t, int16x8_t, vabsq, s16) -VABS_IMPL(int32x4_t, int32x4_t, vabsq, s32) -VABS_IMPL(float32x4_t, float32x4_t, vabsq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VABS_IMPL(float16x8_t, float16x8_t, vabsq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -// Saturating absolute: vqabs{q}_<type>.
Vd[i] = sat(|Va[i]|) -VQABS_IMPL(int8x8_t, int8x8_t, vqabs, s8) -VQABS_IMPL(int16x4_t, int16x4_t, vqabs, s16) -VQABS_IMPL(int32x2_t, int32x2_t, vqabs, s32) - -VQABS_IMPL(int8x16_t, int8x16_t, vqabsq, s8) -VQABS_IMPL(int16x8_t, int16x8_t, vqabsq, s16) -VQABS_IMPL(int32x4_t, int32x4_t, vqabsq, s32) - -#undef VABS_IMPL -#undef VQABS_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_ABS_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h deleted file mode 100644 index 5bca891da5..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/add.h +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
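A note on the vabs/vqabs pair from abs.h above: the two differ only at the type's most negative value, where the two's-complement |INT_MIN| wraps in the plain form and clamps in the saturating form. That is also why abs.h instantiates no saturating variant for float vectors (there is nothing to saturate). A minimal sketch, not part of this patch, assuming a NEON-enabled toolchain:

#include <arm_neon.h>
#include <cstdio>

int main()
{
    const int8x8_t v  = vdup_n_s8(-128);
    const int8x8_t a  = vabs_s8(v);  // wraps: |-128| -> -128 in int8
    const int8x8_t qa = vqabs_s8(v); // saturates: |-128| -> +127
    std::printf("vabs: %d, vqabs: %d\n", vget_lane_s8(a, 0), vget_lane_s8(qa, 0));
    return 0;
}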
- */ -#ifndef ARM_COMPUTE_WRAPPER_ADD_H -#define ARM_COMPUTE_WRAPPER_ADD_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VADD_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vadd(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VADD_IMPL(uint8x8_t, uint8x8_t, vadd, u8) -VADD_IMPL(int8x8_t, int8x8_t, vadd, s8) -VADD_IMPL(uint16x4_t, uint16x4_t, vadd, u16) -VADD_IMPL(int16x4_t, int16x4_t, vadd, s16) -VADD_IMPL(uint32x2_t, uint32x2_t, vadd, u32) -VADD_IMPL(int32x2_t, int32x2_t, vadd, s32) -VADD_IMPL(uint64x1_t, uint64x1_t, vadd, u64) -VADD_IMPL(int64x1_t, int64x1_t, vadd, s64) -VADD_IMPL(float32x2_t, float32x2_t, vadd, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VADD_IMPL(float16x4_t, float16x4_t, vadd, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VADD_IMPL(uint8x16_t, uint8x16_t, vaddq, u8) -VADD_IMPL(int8x16_t, int8x16_t, vaddq, s8) -VADD_IMPL(uint16x8_t, uint16x8_t, vaddq, u16) -VADD_IMPL(int16x8_t, int16x8_t, vaddq, s16) -VADD_IMPL(uint32x4_t, uint32x4_t, vaddq, u32) -VADD_IMPL(int32x4_t, int32x4_t, vaddq, s32) -VADD_IMPL(uint64x2_t, uint64x2_t, vaddq, u64) -VADD_IMPL(int64x2_t, int64x2_t, vaddq, s64) -VADD_IMPL(float32x4_t, float32x4_t, vaddq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VADD_IMPL - -// VQADD: Vector saturating add (No notion of saturation for floating point) -#define VQADD_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vqadd(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8) -VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8) -VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16) -VQADD_IMPL(int16x4_t, int16x4_t, vqadd, s16) -VQADD_IMPL(uint32x2_t, uint32x2_t, vqadd, u32) -VQADD_IMPL(int32x2_t, int32x2_t, vqadd, s32) -VQADD_IMPL(uint64x1_t, uint64x1_t, vqadd, u64) -VQADD_IMPL(int64x1_t, int64x1_t, vqadd, s64) -VQADD_IMPL(float32x2_t, float32x2_t, vadd, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VQADD_IMPL(float16x4_t, float16x4_t, vadd, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VQADD_IMPL(uint8x16_t, uint8x16_t, vqaddq, u8) -VQADD_IMPL(int8x16_t, int8x16_t, vqaddq, s8) -VQADD_IMPL(uint16x8_t, uint16x8_t, vqaddq, u16) -VQADD_IMPL(int16x8_t, int16x8_t, vqaddq, s16) -VQADD_IMPL(uint32x4_t, uint32x4_t, vqaddq, u32) -VQADD_IMPL(int32x4_t, int32x4_t, vqaddq, s32) -VQADD_IMPL(uint64x2_t, uint64x2_t, vqaddq, u64) -VQADD_IMPL(int64x2_t, int64x2_t, vqaddq, s64) -VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VQADD_IMPL - -// VADDW: Vector widening add -#define VADDW_IMPL(wtype, vtype, prefix, postfix) \ - inline wtype vaddw(const wtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8) -VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8) -VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16) -VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16) -VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32) -VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32) -#undef VADDW_IMPL - -// VADDL: Vector long add -#define VADDL_IMPL(wtype, vtype, prefix, postfix) \ - inline wtype vaddl(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8) -VADDL_IMPL(int16x8_t,
int8x8_t, vaddl, s8) -VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16) -VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16) -VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32) -VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32) -#undef VADDL_IMPL - -#if defined(__aarch64__) -// VADDV: Across vector add -#define VADDV_IMPL(stype, vtype, prefix, postfix) \ - inline stype vaddv(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8) -VADDV_IMPL(int8_t, int8x8_t, vaddv, s8) -VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16) -VADDV_IMPL(int16_t, int16x4_t, vaddv, s16) -VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32) -VADDV_IMPL(int32_t, int32x2_t, vaddv, s32) -VADDV_IMPL(float, float32x2_t, vaddv, f32) - -VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8) -VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8) -VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16) -VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16) -VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32) -VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32) -VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64) -VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64) -VADDV_IMPL(float, float32x4_t, vaddvq, f32) -#undef VADDV_IMPL -#endif // defined(__aarch64__) - -// VPADDL: Signed add long pairwise -#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \ - inline ltype vpaddl(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8) -VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8) -VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16) -VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16) -VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32) -VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32) - -VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8) -VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8) -VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16) -VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16) -VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32) -VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32) -#undef VPADDL_IMPL - -// VPADD: Add pairwise -#define VPADD_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vpadd(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VPADD_IMPL(uint8x8_t, uint8x8_t, vpadd, u8) -VPADD_IMPL(int8x8_t, int8x8_t, vpadd, s8) -VPADD_IMPL(uint16x4_t, uint16x4_t, vpadd, u16) -VPADD_IMPL(int16x4_t, int16x4_t, vpadd, s16) -VPADD_IMPL(uint32x2_t, uint32x2_t, vpadd, u32) -VPADD_IMPL(int32x2_t, int32x2_t, vpadd, s32) -VPADD_IMPL(float32x2_t, float32x2_t, vpadd, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VPADD_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_ADD_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/and.h b/arm_compute/core/NEON/wrapper/intrinsics/and.h deleted file mode 100644 index 8fffe35b8c..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/and.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_AND_H -#define ARM_COMPUTE_WRAPPER_AND_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VAND_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vand(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VAND_IMPL(uint8_t, uint8x8_t, vand, u8) -VAND_IMPL(int8_t, int8x8_t, vand, s8) -VAND_IMPL(uint16_t, uint16x4_t, vand, u16) -VAND_IMPL(int16_t, int16x4_t, vand, s16) -VAND_IMPL(uint32_t, uint32x2_t, vand, u32) -VAND_IMPL(int32_t, int32x2_t, vand, s32) -VAND_IMPL(uint64_t, uint64x1_t, vand, u64) -VAND_IMPL(int64_t, int64x1_t, vand, s64) - -VAND_IMPL(uint8_t, uint8x16_t, vandq, u8) -VAND_IMPL(int8_t, int8x16_t, vandq, s8) -VAND_IMPL(uint16_t, uint16x8_t, vandq, u16) -VAND_IMPL(int16_t, int16x8_t, vandq, s16) -VAND_IMPL(uint32_t, uint32x4_t, vandq, u32) -VAND_IMPL(int32_t, int32x4_t, vandq, s32) -VAND_IMPL(uint64_t, uint64x2_t, vandq, u64) -VAND_IMPL(int64_t, int64x2_t, vandq, s64) - -#undef VAND_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_AND_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h deleted file mode 100644 index 6d01b8a685..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_BSL_H -#define ARM_COMPUTE_WRAPPER_BSL_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VBSL_IMPL(stype, vtype, ctype, prefix, postfix) \ - inline vtype vbsl(const ctype &a, const vtype &b, const vtype &c) \ - { \ - return prefix##_##postfix(a, b, c); \ - } - -VBSL_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8) -VBSL_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8) -VBSL_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16) -VBSL_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16) -VBSL_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32) -VBSL_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32) -VBSL_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBSL_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VBSL_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8) -VBSL_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8) -VBSL_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16) -VBSL_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16) -VBSL_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32) -VBSL_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32) -VBSL_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBSL_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VBSL_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_BSL_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h deleted file mode 100644 index a84984d190..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
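The bitwise-select wrapper deleted above pairs naturally with the comparison wrappers that follow: the mask argument chooses, per bit, between the second and third operand. A minimal sketch, not part of this patch, of a branchless per-lane maximum built that way:

#include <arm_neon.h>

inline float32x4_t max_f32(float32x4_t a, float32x4_t b)
{
    const uint32x4_t mask = vcgtq_f32(a, b); // all ones where a > b
    return vbslq_f32(mask, a, b);            // pick a where mask is set, else b
}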
- */ -#ifndef ARM_COMPUTE_WRAPPER_CEQ_H -#define ARM_COMPUTE_WRAPPER_CEQ_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCEQ_IMPL(votype, vtype, prefix, postfix) \ - inline votype vceq(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8) -VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8) -VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16) -VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16) -VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32) -VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32) -VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8) -VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8) -VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16) -VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16) -VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32) -VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32) -VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCEQ_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_CEQ_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cge.h b/arm_compute/core/NEON/wrapper/intrinsics/cge.h deleted file mode 100644 index ac2973bed4..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/cge.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
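As the vceq overloads above show, NEON comparisons return unsigned lane masks (all ones or all zeros) of the same width as the compared lanes, which is why the float variants return uint32x4_t/uint16x4_t rather than a float vector. A minimal sketch, not part of this patch, turning such a mask into a match count; the AArch32 branch reuses the pairwise adds from add.h above:

#include <arm_neon.h>
#include <cstdint>

inline int count_equal_lanes(int32x4_t a, int32x4_t b)
{
    const uint32x4_t eq   = vceqq_s32(a, b);     // 0xFFFFFFFF per equal lane
    const uint32x4_t ones = vshrq_n_u32(eq, 31); // reduce each mask lane to 1
#if defined(__aarch64__)
    return static_cast<int>(vaddvq_u32(ones));   // across-vector add
#else
    const uint64x2_t sums = vpaddlq_u32(ones);   // pairwise widening add
    return static_cast<int>(vgetq_lane_u64(sums, 0) + vgetq_lane_u64(sums, 1));
#endif
}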
- */ -#ifndef ARM_COMPUTE_WRAPPER_CGE_H -#define ARM_COMPUTE_WRAPPER_CGE_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCGE_IMPL(stype, vtype, rtype, prefix, postfix) \ - inline rtype vcge(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCGE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcge, u8) -VCGE_IMPL(int8_t, int8x8_t, uint8x8_t, vcge, s8) -VCGE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcge, u16) -VCGE_IMPL(int16_t, int16x4_t, uint16x4_t, vcge, s16) -VCGE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcge, u32) -VCGE_IMPL(int32_t, int32x2_t, uint32x2_t, vcge, s32) -VCGE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcge, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcge, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VCGE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgeq, u8) -VCGE_IMPL(int8_t, int8x16_t, uint8x16_t, vcgeq, s8) -VCGE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgeq, u16) -VCGE_IMPL(int16_t, int16x8_t, uint16x8_t, vcgeq, s16) -VCGE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgeq, u32) -VCGE_IMPL(int32_t, int32x4_t, uint32x4_t, vcgeq, s32) -VCGE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgeq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgeq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCGE_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_CGE_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h deleted file mode 100644 index c7ae2caefe..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_CGT_H -#define ARM_COMPUTE_WRAPPER_CGT_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCGT_IMPL(rtype, vtype, prefix, postfix) \ - inline rtype vcgt(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8) -VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8) -VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16) -VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16) -VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32) -VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32) -VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8) -VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8) -VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16) -VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16) -VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32) -VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32) -VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCGT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_CGT_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cle.h b/arm_compute/core/NEON/wrapper/intrinsics/cle.h deleted file mode 100644 index 50c175f0c8..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/cle.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_CLE_H -#define ARM_COMPUTE_WRAPPER_CLE_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCLE_IMPL(stype, vtype, rtype, prefix, postfix) \ - inline rtype vcle(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCLE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcle, u8) -VCLE_IMPL(int8_t, int8x8_t, uint8x8_t, vcle, s8) -VCLE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcle, u16) -VCLE_IMPL(int16_t, int16x4_t, uint16x4_t, vcle, s16) -VCLE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcle, u32) -VCLE_IMPL(int32_t, int32x2_t, uint32x2_t, vcle, s32) -VCLE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcle, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCLE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcle, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VCLE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcleq, u8) -VCLE_IMPL(int8_t, int8x16_t, uint8x16_t, vcleq, s8) -VCLE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcleq, u16) -VCLE_IMPL(int16_t, int16x8_t, uint16x8_t, vcleq, s16) -VCLE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcleq, u32) -VCLE_IMPL(int32_t, int32x4_t, uint32x4_t, vcleq, s32) -VCLE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcleq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCLE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcleq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCLE_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_CLE_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/arm_compute/core/NEON/wrapper/intrinsics/clt.h deleted file mode 100644 index 2d1ea2863e..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/clt.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_CLT_H -#define ARM_COMPUTE_WRAPPER_CLT_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCLT_IMPL(votype, vtype, prefix, postfix) \ - inline votype vclt(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8) -VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8) -VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16) -VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16) -VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32) -VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32) -VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8) -VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8) -VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16) -VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16) -VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32) -VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32) -VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCLT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_CLT_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/combine.h b/arm_compute/core/NEON/wrapper/intrinsics/combine.h deleted file mode 100644 index c9d5bf8d90..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/combine.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_COMBINE_H -#define ARM_COMPUTE_WRAPPER_COMBINE_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCOMBINE_IMPL(rtype, vtype, prefix, postfix) \ - inline rtype vcombine(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCOMBINE_IMPL(uint8x16_t, uint8x8_t, vcombine, u8) -VCOMBINE_IMPL(int8x16_t, int8x8_t, vcombine, s8) -VCOMBINE_IMPL(uint16x8_t, uint16x4_t, vcombine, u16) -VCOMBINE_IMPL(int16x8_t, int16x4_t, vcombine, s16) -VCOMBINE_IMPL(uint32x4_t, uint32x2_t, vcombine, u32) -VCOMBINE_IMPL(int32x4_t, int32x2_t, vcombine, s32) -VCOMBINE_IMPL(float32x4_t, float32x2_t, vcombine, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCOMBINE_IMPL(float16x8_t, float16x4_t, vcombine, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCOMBINE_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_COMBINE_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h deleted file mode 100644 index 6e79a92bc2..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
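vcombine above is the inverse of the vgethigh/vgetlow accessors later in this hunk: kernels often split a 128-bit register, work on the 64-bit halves, and recombine. A minimal sketch, not part of this patch; the saturating add is done per half purely for illustration (vqaddq_u8 would do it in one step):

#include <arm_neon.h>

inline uint8x16_t add_sat_by_halves(uint8x16_t v, uint8x8_t delta)
{
    const uint8x8_t lo = vqadd_u8(vget_low_u8(v), delta);
    const uint8x8_t hi = vqadd_u8(vget_high_u8(v), delta);
    return vcombine_u8(lo, hi); // stitch the halves back into a q-register
}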
- */ -#ifndef ARM_COMPUTE_WRAPPER_CVT_H -#define ARM_COMPUTE_WRAPPER_CVT_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ - } - -VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32) -VCVT_TO_F32_IMPL(float32x4_t, int32x4_t, vcvtq, f32, s32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VCVT_TO_F32_IMPL - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ - } - -VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) -#undef VCVT_TO_F16_IMPL -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <typename T> -inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type -vcvt(const float32x4_t &a) -{ - return vcvtq_u32_f32(a); -} - -template <typename T> -inline typename std::enable_if<std::is_same<T, int8_t>::value, int32x4_t>::type -vcvt(const float32x4_t &a) -{ - return vcvtq_s32_f32(a); -} - -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) -/** Convert 2x128-bit floating point vectors into 1x128-bit bfloat16 vector - * - * @param[in] inptr Pointer to the input memory to load values from - * @param[in,out] outptr Pointer to the output memory to store values to - */ -inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr) -{ - __asm __volatile( - "ldp q0, q1, [%[inptr]]\n" - ".inst 0xea16800\n" // BFCVTN v0, v0 - ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 - "str q0, [%[outptr]]\n" - : [inptr] "+r"(inptr) - : [outptr] "r"(outptr) - : "v0", "v1", "memory"); -} -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ - -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_CVT_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/div.h b/arm_compute/core/NEON/wrapper/intrinsics/div.h deleted file mode 100644 index 5731aba469..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/div.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
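The vcvt overloads above dispatch on the requested destination element type (e.g. wrapper::vcvt<float>(s32_vec)), and the bfloat16 helper converts eight floats per call via the two BFCVTN instructions encoded with .inst. A minimal call-site sketch, not part of this patch, assuming the post-move include path src/core/NEON/wrapper/wrapper.h, a BF16-capable build, and n a multiple of eight:

#include "src/core/NEON/wrapper/wrapper.h"

#include <cstddef>
#include <cstdint>

void to_bf16(const float *src, uint16_t *dst, std::size_t n)
{
    for(std::size_t i = 0; i < n; i += 8)
    {
        // each call loads two float32x4_t (ldp q0, q1) and stores eight bf16 values
        arm_compute::wrapper::vcvt_bf16_f32(src + i, dst + i);
    }
}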
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_DIV_H -#define ARM_COMPUTE_WRAPPER_DIV_H - -#include "arm_compute/core/NEON/NEMath.h" -#include - -namespace arm_compute -{ -namespace wrapper -{ -#ifdef __aarch64__ - -#define VDIV_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vdiv(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } -VDIV_IMPL(float32x2_t, float32x2_t, vdiv, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VDIV_IMPL(float16x4_t, float16x4_t, vdiv, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VDIV_IMPL(float32x4_t, float32x4_t, vdivq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VDIV_IMPL(float16x8_t, float16x8_t, vdivq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#else // __aarch64__ - -#define VDIV_IMPL(stype, vtype, mul_prefix, inv_prefix, postfix) \ - inline vtype vdiv(const vtype &a, const vtype &b) \ - { \ - return mul_prefix##_##postfix(a, inv_prefix##_##postfix(b)); \ - } -VDIV_IMPL(float32x2_t, float32x2_t, vmul, vinv, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VDIV_IMPL(float16x4_t, float16x4_t, vmul, vinv, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VDIV_IMPL(float32x4_t, float32x4_t, vmulq, vinvq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VDIV_IMPL(float16x8_t, float16x8_t, vmulq, vinvq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#endif // __aarch64__ - -#undef VDIV_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_DIV_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h b/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h deleted file mode 100644 index 80d4c4074f..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
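On AArch32 there is no vector divide instruction, so the vdiv fallback above composes a multiply with vinv from NEMath, which refines the vrecpe reciprocal estimate with Newton-Raphson steps. The same shape in raw intrinsics, as a minimal sketch that is not part of this patch (two refinement steps, mirroring what NEMath's vinvq_f32 does):

#include <arm_neon.h>

inline float32x4_t div_by_reciprocal(float32x4_t a, float32x4_t b)
{
    float32x4_t r = vrecpeq_f32(b);      // coarse reciprocal estimate
    r = vmulq_f32(vrecpsq_f32(b, r), r); // Newton-Raphson step 1
    r = vmulq_f32(vrecpsq_f32(b, r), r); // Newton-Raphson step 2
    return vmulq_f32(a, r);              // a / b (approximate; exact vdiv exists only on AArch64)
}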
- */ -#ifndef ARM_COMPUTE_WRAPPER_DUP_N_H -#define ARM_COMPUTE_WRAPPER_DUP_N_H - -#include "arm_compute/core/NEON/wrapper/traits.h" - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VDUP_N_IMPL(stype, vtype, prefix, postfix, tag) \ - inline vtype vdup_n(stype value, tag) \ - { \ - return prefix##_##postfix(value); \ - } - -VDUP_N_IMPL(uint8_t, uint8x8_t, vdup_n, u8, traits::vector_64_tag) -VDUP_N_IMPL(int8_t, int8x8_t, vdup_n, s8, traits::vector_64_tag) -VDUP_N_IMPL(uint16_t, uint16x4_t, vdup_n, u16, traits::vector_64_tag) -VDUP_N_IMPL(int16_t, int16x4_t, vdup_n, s16, traits::vector_64_tag) -VDUP_N_IMPL(uint32_t, uint32x2_t, vdup_n, u32, traits::vector_64_tag) -VDUP_N_IMPL(int32_t, int32x2_t, vdup_n, s32, traits::vector_64_tag) -VDUP_N_IMPL(float, float32x2_t, vdup_n, f32, traits::vector_64_tag) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VDUP_N_IMPL(float16_t, float16x4_t, vdup_n, f16, traits::vector_64_tag) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VDUP_N_IMPL(uint8_t, uint8x16_t, vdupq_n, u8, traits::vector_128_tag) -VDUP_N_IMPL(int8_t, int8x16_t, vdupq_n, s8, traits::vector_128_tag) -VDUP_N_IMPL(uint16_t, uint16x8_t, vdupq_n, u16, traits::vector_128_tag) -VDUP_N_IMPL(int16_t, int16x8_t, vdupq_n, s16, traits::vector_128_tag) -VDUP_N_IMPL(uint32_t, uint32x4_t, vdupq_n, u32, traits::vector_128_tag) -VDUP_N_IMPL(int32_t, int32x4_t, vdupq_n, s32, traits::vector_128_tag) -VDUP_N_IMPL(float, float32x4_t, vdupq_n, f32, traits::vector_128_tag) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VDUP_N_IMPL(float16_t, float16x8_t, vdupq_n, f16, traits::vector_128_tag) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VDUP_N_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_DUP_N_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/arm_compute/core/NEON/wrapper/intrinsics/eor.h deleted file mode 100644 index 227a743c3d..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/eor.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
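The extra tag parameter in vdup_n above is what lets callers pick the register width without naming the intrinsic: traits::vector_64_tag selects vdup_n_*, traits::vector_128_tag selects vdupq_n_*. A minimal width-generic sketch, not part of this patch, assuming the post-move include path:

#include "src/core/NEON/wrapper/wrapper.h"

// Returns a zeroed 64- or 128-bit vector depending on TagType, e.g.
// zero_vector<float, arm_compute::wrapper::traits::vector_128_tag>() -> float32x4_t.
template <typename T, typename TagType>
auto zero_vector()
{
    return arm_compute::wrapper::vdup_n(static_cast<T>(0), TagType{});
}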
- */ -#ifndef ARM_COMPUTE_WRAPPER_EOR_H -#define ARM_COMPUTE_WRAPPER_EOR_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VEOR_IMPL(vtype, prefix, postfix) \ - inline vtype veor(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VEOR_IMPL(uint8x8_t, veor, u8) -VEOR_IMPL(int8x8_t, veor, s8) -VEOR_IMPL(uint16x4_t, veor, u16) -VEOR_IMPL(int16x4_t, veor, s16) -VEOR_IMPL(uint32x2_t, veor, u32) -VEOR_IMPL(int32x2_t, veor, s32) - -VEOR_IMPL(uint8x16_t, veorq, u8) -VEOR_IMPL(int8x16_t, veorq, s8) -VEOR_IMPL(uint16x8_t, veorq, u16) -VEOR_IMPL(int16x8_t, veorq, s16) -VEOR_IMPL(uint32x4_t, veorq, u32) -VEOR_IMPL(int32x4_t, veorq, s32) - -#undef VEOR_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_EOR_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/exp.h b/arm_compute/core/NEON/wrapper/intrinsics/exp.h deleted file mode 100644 index d50824b132..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/exp.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_EXP_H -#define ARM_COMPUTE_WRAPPER_EXP_H - -#include "arm_compute/core/NEON/NEMath.h" -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VEXPQ_IMPL(vtype, postfix) \ - inline vtype vexpq(const vtype &a) \ - { \ - return vexpq_##postfix(a); \ - } - -#define VEXPQ_IMPL_INT(vtype, postfix) \ - inline vtype vexpq(const vtype &a) \ - { \ - ARM_COMPUTE_UNUSED(a); \ - ARM_COMPUTE_ERROR("Not supported"); \ - } - -VEXPQ_IMPL(float32x4_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VEXPQ_IMPL(float16x8_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VEXPQ_IMPL_INT(int32x4_t, s32) -#undef VEXPQ_IMPL - -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_EXP_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ext.h b/arm_compute/core/NEON/wrapper/intrinsics/ext.h deleted file mode 100644 index d44b231bb2..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/ext.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_EXT_H -#define ARM_COMPUTE_WRAPPER_EXT_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VEXT_IMPL(vtype, prefix, postfix, size) \ - inline vtype vext_##size(vtype value_a, vtype value_b) \ - { \ - return prefix##_##postfix(value_a, value_b, size); \ - } - -VEXT_IMPL(uint8x8_t, vext, u8, 1) -VEXT_IMPL(uint8x8_t, vext, u8, 2) -VEXT_IMPL(int8x8_t, vext, s8, 1) -VEXT_IMPL(int8x8_t, vext, s8, 2) -VEXT_IMPL(uint16x4_t, vext, u16, 1) -VEXT_IMPL(uint16x4_t, vext, u16, 2) -VEXT_IMPL(int16x4_t, vext, s16, 1) -VEXT_IMPL(int16x4_t, vext, s16, 2) - -VEXT_IMPL(uint8x16_t, vextq, u8, 1) -VEXT_IMPL(uint8x16_t, vextq, u8, 2) -VEXT_IMPL(int8x16_t, vextq, s8, 1) -VEXT_IMPL(int8x16_t, vextq, s8, 2) -VEXT_IMPL(uint16x8_t, vextq, u16, 1) -VEXT_IMPL(uint16x8_t, vextq, u16, 2) -VEXT_IMPL(int16x8_t, vextq, s16, 1) -VEXT_IMPL(int16x8_t, vextq, s16, 2) -VEXT_IMPL(int32x4_t, vextq, s32, 1) -VEXT_IMPL(int32x4_t, vextq, s32, 2) - -#undef VEXT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_EXT_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h deleted file mode 100644 index d98e129cd9..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_GET_HIGH_H -#define ARM_COMPUTE_WRAPPER_GET_HIGH_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VGETHIGH_IMPL(half_vtype, vtype, postfix) \ - inline half_vtype vgethigh(const vtype val) \ - { \ - return vget_high_##postfix(val); \ - } - -VGETHIGH_IMPL(uint8x8_t, uint8x16_t, u8) -VGETHIGH_IMPL(int8x8_t, int8x16_t, s8) -VGETHIGH_IMPL(uint16x4_t, uint16x8_t, u16) -VGETHIGH_IMPL(int16x4_t, int16x8_t, s16) -VGETHIGH_IMPL(uint32x2_t, uint32x4_t, u32) -VGETHIGH_IMPL(int32x2_t, int32x4_t, s32) -VGETHIGH_IMPL(float32x2_t, float32x4_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VGETHIGH_IMPL(float16x4_t, float16x8_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VGETHIGH_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_GET_HIGH_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h deleted file mode 100644 index 2052751612..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
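vgethigh above (and its vgetlow counterpart later in this hunk) splits a q-register into d-register halves, typically as the first step of a widening operation. A minimal sketch, not part of this patch:

#include <arm_neon.h>

// Widen sixteen u8 lanes into two vectors of eight u16 lanes each.
inline uint16x8x2_t widen_u8x16(uint8x16_t v)
{
    return { { vmovl_u8(vget_low_u8(v)), vmovl_u8(vget_high_u8(v)) } };
}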
- */ -#ifndef ARM_COMPUTE_WRAPPER_GET_LANE_H -#define ARM_COMPUTE_WRAPPER_GET_LANE_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VGETLANE_IMPL_8(stype, vtype, postfix) \ - inline stype vgetlane(const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vget_lane_##postfix(vector, 0); \ - case 1: \ - return vget_lane_##postfix(vector, 1); \ - case 2: \ - return vget_lane_##postfix(vector, 2); \ - case 3: \ - return vget_lane_##postfix(vector, 3); \ - case 4: \ - return vget_lane_##postfix(vector, 4); \ - case 5: \ - return vget_lane_##postfix(vector, 5); \ - case 6: \ - return vget_lane_##postfix(vector, 6); \ - case 7: \ - return vget_lane_##postfix(vector, 7); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VGETLANE_IMPL_4(stype, vtype, postfix) \ - inline stype vgetlane(const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vget_lane_##postfix(vector, 0); \ - case 1: \ - return vget_lane_##postfix(vector, 1); \ - case 2: \ - return vget_lane_##postfix(vector, 2); \ - case 3: \ - return vget_lane_##postfix(vector, 3); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VGETLANE_IMPL_2(stype, vtype, postfix) \ - inline stype vgetlane(const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vget_lane_##postfix(vector, 0); \ - case 1: \ - return vget_lane_##postfix(vector, 1); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -VGETLANE_IMPL_8(uint8_t, uint8x8_t, u8) -VGETLANE_IMPL_8(int8_t, int8x8_t, s8) -VGETLANE_IMPL_4(uint16_t, uint16x4_t, u16) -VGETLANE_IMPL_4(int16_t, int16x4_t, s16) -VGETLANE_IMPL_2(uint32_t, uint32x2_t, u32) -VGETLANE_IMPL_2(int32_t, int32x2_t, s32) -VGETLANE_IMPL_2(float, float32x2_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VGETLANE_IMPL_4(float16_t, float16x4_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#define VGETQLANE_IMPL_16(stype, vtype, postfix) \ - inline stype vgetlane(const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vgetq_lane_##postfix(vector, 0); \ - case 1: \ - return vgetq_lane_##postfix(vector, 1); \ - case 2: \ - return vgetq_lane_##postfix(vector, 2); \ - case 3: \ - return vgetq_lane_##postfix(vector, 3); \ - case 4: \ - return vgetq_lane_##postfix(vector, 4); \ - case 5: \ - return vgetq_lane_##postfix(vector, 5); \ - case 6: \ - return vgetq_lane_##postfix(vector, 6); \ - case 7: \ - return vgetq_lane_##postfix(vector, 7); \ - case 8: \ - return vgetq_lane_##postfix(vector, 8); \ - case 9: \ - return vgetq_lane_##postfix(vector, 9); \ - case 10: \ - return vgetq_lane_##postfix(vector, 10); \ - case 11: \ - return vgetq_lane_##postfix(vector, 11); \ - case 12: \ - return vgetq_lane_##postfix(vector, 12); \ - case 13: \ - return vgetq_lane_##postfix(vector, 13); \ - case 14: \ - return vgetq_lane_##postfix(vector, 14); \ - case 15: \ - return vgetq_lane_##postfix(vector, 15); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VGETQLANE_IMPL_8(stype, vtype, postfix) \ - inline stype vgetlane(const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vgetq_lane_##postfix(vector, 0); \ - case 1: \ - return vgetq_lane_##postfix(vector, 1); \ - case 2: \ - return vgetq_lane_##postfix(vector, 2); \ - case 3: \ - return vgetq_lane_##postfix(vector, 3); \ - case 4: \ - return vgetq_lane_##postfix(vector, 4); \ - 
case 5: \ - return vgetq_lane_##postfix(vector, 5); \ - case 6: \ - return vgetq_lane_##postfix(vector, 6); \ - case 7: \ - return vgetq_lane_##postfix(vector, 7); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VGETQLANE_IMPL_4(stype, vtype, postfix) \ - inline stype vgetlane(const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vgetq_lane_##postfix(vector, 0); \ - case 1: \ - return vgetq_lane_##postfix(vector, 1); \ - case 2: \ - return vgetq_lane_##postfix(vector, 2); \ - case 3: \ - return vgetq_lane_##postfix(vector, 3); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VGETQLANE_IMPL_2(stype, vtype, postfix) \ - inline stype vgetlane(const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vgetq_lane_##postfix(vector, 0); \ - case 1: \ - return vgetq_lane_##postfix(vector, 1); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -VGETQLANE_IMPL_16(uint8_t, uint8x16_t, u8) -VGETQLANE_IMPL_16(int8_t, int8x16_t, s8) -VGETQLANE_IMPL_8(uint16_t, uint16x8_t, u16) -VGETQLANE_IMPL_8(int16_t, int16x8_t, s16) -VGETQLANE_IMPL_4(uint32_t, uint32x4_t, u32) -VGETQLANE_IMPL_4(int32_t, int32x4_t, s32) -VGETQLANE_IMPL_4(float, float32x4_t, f32) -VGETQLANE_IMPL_2(int64_t, int64x2_t, s64) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VGETQLANE_IMPL_8(float16_t, float16x8_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VGETLANE_IMPL_8 -#undef VGETLANE_IMPL_4 -#undef VGETLANE_IMPL_2 - -#undef VGETQLANE_IMPL_16 -#undef VGETQLANE_IMPL_8 -#undef VGETQLANE_IMPL_4 -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_GET_LANE_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h deleted file mode 100644 index b85b6cabf4..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
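The switch ladders above exist because vget_lane/vgetq_lane require the lane index to be a compile-time constant; vgetlane trades a branch for a runtime index. A sketch of where that matters (illustrative helper, assumes the wrapper header above is in scope):

    #include <arm_neon.h>

    // Horizontal sum of the first n lanes (tail handling). With raw
    // intrinsics, vgetq_lane_f32(v, i) would not compile for non-constant i;
    // wrapper::vgetlane(v, i) dispatches through the switch instead.
    float sum_first_n(float32x4_t v, unsigned int n) // n <= 4
    {
        float acc = 0.f;
        for(unsigned int i = 0; i < n; ++i)
        {
            acc += arm_compute::wrapper::vgetlane(v, i);
        }
        return acc;
    }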
- */
-#ifndef ARM_COMPUTE_WRAPPER_GET_LOW_H
-#define ARM_COMPUTE_WRAPPER_GET_LOW_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VGETLOW_IMPL(half_vtype, vtype, postfix) \
-    inline half_vtype vgetlow(const vtype val)   \
-    {                                            \
-        return vget_low_##postfix(val);          \
-    }
-
-VGETLOW_IMPL(uint8x8_t, uint8x16_t, u8)
-VGETLOW_IMPL(int8x8_t, int8x16_t, s8)
-VGETLOW_IMPL(uint16x4_t, uint16x8_t, u16)
-VGETLOW_IMPL(int16x4_t, int16x8_t, s16)
-VGETLOW_IMPL(uint32x2_t, uint32x4_t, u32)
-VGETLOW_IMPL(int32x2_t, int32x4_t, s32)
-VGETLOW_IMPL(float32x2_t, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETLOW_IMPL(float16x4_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VGETLOW_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_GET_LOW_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
deleted file mode 100644
index a30e723dc0..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_WRAPPER_INTRINSICS_H -#define ARM_COMPUTE_WRAPPER_INTRINSICS_H - -#include "arm_compute/core/NEON/wrapper/intrinsics/abs.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/add.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/and.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cge.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cle.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/clt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/combine.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cvt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/div.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/eor.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/exp.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/ext.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/inv.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/load.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/log.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/max.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/min.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/mla.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/movl.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/movn.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/mul.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/neg.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/not.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/orr.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/pow.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/qmov.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/qmovun.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/round.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/sin.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/store.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/sub.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/tanh.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/tbl.h" - -#endif /* ARM_COMPUTE_WRAPPER_INTRINSICS_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/inv.h b/arm_compute/core/NEON/wrapper/intrinsics/inv.h deleted file mode 100644 index 889d176670..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/inv.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. 
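intrinsics.h is the umbrella header: including it gives a single overload set (vadd, vmul, vmax, ...) so a kernel can be written once and instantiated per element type. A sketch of the pattern this enables, with traits simplified (illustrative, not part of the patch):

    // Generic scale-and-bias: compiles for float32x4_t, int32x4_t and, when
    // FP16 support is built, float16x8_t, because vmul/vadd are overloaded
    // per vector type by the headers listed above.
    template <typename V>
    V scale_bias(V x, V scale, V bias)
    {
        return arm_compute::wrapper::vadd(arm_compute::wrapper::vmul(x, scale), bias);
    }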
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_INV_H -#define ARM_COMPUTE_WRAPPER_INV_H - -#include "arm_compute/core/NEON/NEMath.h" -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VINV_IMPL(vtype, prefix, postfix) \ - inline vtype vinv(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -#define VINV_IMPL_INT(vtype, prefix, postfix) \ - inline vtype vinv(const vtype &a) \ - { \ - ARM_COMPUTE_UNUSED(a); \ - ARM_COMPUTE_ERROR("Not supported"); \ - } - -VINV_IMPL(float32x2_t, vinv, f32) -VINV_IMPL_INT(int32x2_t, vinv, s32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VINV_IMPL(float16x4_t, vinv, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VINV_IMPL(float32x4_t, vinvq, f32) -VINV_IMPL_INT(int32x4_t, vinvq, s32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VINV_IMPL(float16x8_t, vinvq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VINV_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_INV_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h b/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h deleted file mode 100644 index 8269afe1a2..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
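vinv has no single NEON instruction behind it; the f32/f16 overloads forward to helpers in NEMath.h, which follow the usual scheme of a reciprocal estimate refined by Newton-Raphson steps. Roughly, as a sketch of that scheme rather than the exact NEMath body:

    #include <arm_neon.h>

    // Reciprocal via estimate plus two Newton-Raphson refinements:
    // x_{n+1} = x_n * (2 - a * x_n), expressed with vrecpe/vrecps.
    float32x4_t reciprocal_f32(float32x4_t a)
    {
        float32x4_t x = vrecpeq_f32(a);      // low-precision estimate of 1/a
        x = vmulq_f32(vrecpsq_f32(a, x), x); // refinement step 1
        x = vmulq_f32(vrecpsq_f32(a, x), x); // refinement step 2
        return x;
    }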
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_INVSQRT_H -#define ARM_COMPUTE_WRAPPER_INVSQRT_H - -#include "arm_compute/core/NEON/NEMath.h" -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VINVSQRT_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vinvsqrt(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -#define VINVSQRT_IMPL_INT(stype, vtype, prefix, postfix) \ - inline vtype vinvsqrt(const vtype &a) \ - { \ - ARM_COMPUTE_UNUSED(a); \ - ARM_COMPUTE_ERROR("Not supported"); \ - } - -VINVSQRT_IMPL(float, float32x2_t, vinvsqrt, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VINVSQRT_IMPL(float16_t, float16x4_t, vinvsqrt, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VINVSQRT_IMPL_INT(int, int32x4_t, vinvsqrt, s32) - -VINVSQRT_IMPL(float, float32x4_t, vinvsqrtq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VINVSQRT_IMPL(float16_t, float16x8_t, vinvsqrtq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VINVSQRT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_INVSQRT_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/arm_compute/core/NEON/wrapper/intrinsics/load.h deleted file mode 100644 index 0fdf705d61..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/load.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_LOAD_H -#define ARM_COMPUTE_WRAPPER_LOAD_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VLOAD_IMPL(stype, vtype, postfix) \ - inline vtype vload(const stype *ptr) \ - { \ - return vld1_##postfix(ptr); \ - } - -VLOAD_IMPL(uint8_t, uint8x8_t, u8) -VLOAD_IMPL(int8_t, int8x8_t, s8) -VLOAD_IMPL(uint16_t, uint16x4_t, u16) -VLOAD_IMPL(int16_t, int16x4_t, s16) -VLOAD_IMPL(uint32_t, uint32x2_t, u32) -VLOAD_IMPL(int32_t, int32x2_t, s32) -//VLOAD_IMPL(uint64_t, uint64x1_t, u64) -//VLOAD_IMPL(int64_t, int64x1_t, s64) -VLOAD_IMPL(float, float32x2_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VLOAD_IMPL(float16_t, float16x4_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#define VLOADQ_IMPL(stype, vtype, postfix) \ - inline vtype vloadq(const stype *ptr) \ - { \ - return vld1q_##postfix(ptr); \ - } - -VLOADQ_IMPL(uint8_t, uint8x16_t, u8) -VLOADQ_IMPL(int8_t, int8x16_t, s8) -VLOADQ_IMPL(uint16_t, uint16x8_t, u16) -VLOADQ_IMPL(int16_t, int16x8_t, s16) -VLOADQ_IMPL(uint32_t, uint32x4_t, u32) -VLOADQ_IMPL(int32_t, int32x4_t, s32) -//VLOAD_IMPL(uint64_t, uint64x1_t, u64) -//VLOAD_IMPL(int64_t, int64x1_t, s64) -VLOADQ_IMPL(float, float32x4_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VLOADQ_IMPL(float16_t, float16x8_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VLOAD_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_LOAD_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/log.h b/arm_compute/core/NEON/wrapper/intrinsics/log.h deleted file mode 100644 index 83de420f91..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/log.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
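vload/vloadq select the right vld1 variant from the pointer type, so templated kernels can load without spelling the type postfix. Usage sketch (illustrative helper; vstore comes from the store.h hunk listed in intrinsics.h and follows the same pattern):

    #include <arm_neon.h>

    // The pointer type picks the overload: const float* -> vld1q_f32,
    // const uint8_t* -> vld1q_u8, and so on.
    void copy4(const float *src, float *dst)
    {
        const float32x4_t v = arm_compute::wrapper::vloadq(src); // vld1q_f32(src)
        arm_compute::wrapper::vstore(dst, v);                    // vst1q_f32(dst, v)
    }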
- */ -#ifndef ARM_COMPUTE_WRAPPER_LOG_H -#define ARM_COMPUTE_WRAPPER_LOG_H - -#include "arm_compute/core/NEON/NEMath.h" -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VLOG_IMPL(vtype, prefix, postfix) \ - inline vtype vlog(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -#define VLOG_IMPL_INT(vtype, prefix, postfix) \ - inline vtype vlog(const vtype &a) \ - { \ - ARM_COMPUTE_UNUSED(a); \ - ARM_COMPUTE_ERROR("Not supported"); \ - } - -VLOG_IMPL(float32x4_t, vlogq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VLOG_IMPL(float16x8_t, vlogq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VLOG_IMPL_INT(int32x4_t, vlogq, s32) - -#undef VLOG_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_LOG_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/max.h b/arm_compute/core/NEON/wrapper/intrinsics/max.h deleted file mode 100644 index 7e52089b56..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/max.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_MAX_H -#define ARM_COMPUTE_WRAPPER_MAX_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VMAX_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vmax(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VMAX_IMPL(uint8_t, uint8x8_t, vmax, u8) -VMAX_IMPL(int8_t, int8x8_t, vmax, s8) -VMAX_IMPL(uint16_t, uint16x4_t, vmax, u16) -VMAX_IMPL(int16_t, int16x4_t, vmax, s16) -VMAX_IMPL(uint32_t, uint32x2_t, vmax, u32) -VMAX_IMPL(int32_t, int32x2_t, vmax, s32) -VMAX_IMPL(float, float32x2_t, vmax, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMAX_IMPL(float16_t, float16x4_t, vmax, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VMAX_IMPL(uint8_t, uint8x16_t, vmaxq, u8) -VMAX_IMPL(int8_t, int8x16_t, vmaxq, s8) -VMAX_IMPL(uint16_t, uint16x8_t, vmaxq, u16) -VMAX_IMPL(int16_t, int16x8_t, vmaxq, s16) -VMAX_IMPL(uint32_t, uint32x4_t, vmaxq, u32) -VMAX_IMPL(int32_t, int32x4_t, vmaxq, s32) -VMAX_IMPL(float, float32x4_t, vmaxq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VMAX_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MAX_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/min.h b/arm_compute/core/NEON/wrapper/intrinsics/min.h deleted file mode 100644 index b287598375..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/min.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_MIN_H -#define ARM_COMPUTE_WRAPPER_MIN_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VMIN_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vmin(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VMIN_IMPL(uint8_t, uint8x8_t, vmin, u8) -VMIN_IMPL(int8_t, int8x8_t, vmin, s8) -VMIN_IMPL(uint16_t, uint16x4_t, vmin, u16) -VMIN_IMPL(int16_t, int16x4_t, vmin, s16) -VMIN_IMPL(uint32_t, uint32x2_t, vmin, u32) -VMIN_IMPL(int32_t, int32x2_t, vmin, s32) -VMIN_IMPL(float, float32x2_t, vmin, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMIN_IMPL(float16_t, float16x4_t, vmin, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VMIN_IMPL(uint8_t, uint8x16_t, vminq, u8) -VMIN_IMPL(int8_t, int8x16_t, vminq, s8) -VMIN_IMPL(uint16_t, uint16x8_t, vminq, u16) -VMIN_IMPL(int16_t, int16x8_t, vminq, s16) -VMIN_IMPL(uint32_t, uint32x4_t, vminq, u32) -VMIN_IMPL(int32_t, int32x4_t, vminq, s32) -VMIN_IMPL(float, float32x4_t, vminq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMIN_IMPL(float16_t, float16x8_t, vminq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VMIN_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MIN_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mla.h b/arm_compute/core/NEON/wrapper/intrinsics/mla.h deleted file mode 100644 index 2c89cfdcff..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/mla.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
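vmin and vmax compose into the clamp used by activation kernels (e.g. bounded ReLU): clamp(x, lo, hi) = vmin(vmax(x, lo), hi). Sketch (illustrative helper, not part of the patch):

    #include <arm_neon.h>

    // Bounded ReLU on four floats: y = min(max(x, 0), upper).
    float32x4_t bounded_relu(float32x4_t x, float32x4_t upper)
    {
        const float32x4_t zero = vdupq_n_f32(0.f);
        return vminq_f32(vmaxq_f32(x, zero), upper); // wrapper: vmin(vmax(x, zero), upper)
    }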
- */ -#ifndef ARM_COMPUTE_WRAPPER_MLA_H -#define ARM_COMPUTE_WRAPPER_MLA_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VMLA_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \ - { \ - return prefix##_##postfix(a, b, c); \ - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#define VMLA_IMPL2(stype, vtype, prefix1, prefix2, postfix) \ - inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \ - { \ - return prefix1##_##postfix(a, prefix2##_##postfix(b, c)); \ - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VMLA_IMPL(uint8x8_t, uint8x8_t, vmla, u8) -VMLA_IMPL(int8x8_t, int8x8_t, vmla, s8) -VMLA_IMPL(uint16x4_t, uint16x4_t, vmla, u16) -VMLA_IMPL(int16x4_t, int16x4_t, vmla, s16) -VMLA_IMPL(uint32x2_t, uint32x2_t, vmla, u32) -VMLA_IMPL(int32x2_t, int32x2_t, vmla, s32) -VMLA_IMPL(float32x2_t, float32x2_t, vmla, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMLA_IMPL2(float16x4_t, float16x4_t, vadd, vmul, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VMLA_IMPL(uint8x16_t, uint8x16_t, vmlaq, u8) -VMLA_IMPL(int8x16_t, int8x16_t, vmlaq, s8) -VMLA_IMPL(uint16x8_t, uint16x8_t, vmlaq, u16) -VMLA_IMPL(int16x8_t, int16x8_t, vmlaq, s16) -VMLA_IMPL(uint32x4_t, uint32x4_t, vmlaq, u32) -VMLA_IMPL(int32x4_t, int32x4_t, vmlaq, s32) -VMLA_IMPL(float32x4_t, float32x4_t, vmlaq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMLA_IMPL2(float16x8_t, float16x8_t, vaddq, vmulq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VMLA_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MLA_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movl.h b/arm_compute/core/NEON/wrapper/intrinsics/movl.h deleted file mode 100644 index fd97a44841..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/movl.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
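Note the f16 path above: there is no vmla for float16, so VMLA_IMPL2 composes the same result as vadd(a, vmul(b, c)). The common accumulate step it serves looks like this (illustrative helper):

    #include <arm_neon.h>

    // Multiply-accumulate: per lane, vmlaq_f32(a, b, c) computes a + b * c.
    float32x4_t dot_step(float32x4_t acc, float32x4_t x, float32x4_t w)
    {
        return vmlaq_f32(acc, x, w); // wrapper form: vmla(acc, x, w)
    }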
- */ -#ifndef ARM_COMPUTE_WRAPPER_MOVL_H -#define ARM_COMPUTE_WRAPPER_MOVL_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VMOVL_IMPL(ptype, vtype, prefix, postfix) \ - inline ptype vmovl(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VMOVL_IMPL(uint16x8_t, uint8x8_t, vmovl, u8) -VMOVL_IMPL(int16x8_t, int8x8_t, vmovl, s8) -VMOVL_IMPL(uint32x4_t, uint16x4_t, vmovl, u16) -VMOVL_IMPL(int32x4_t, int16x4_t, vmovl, s16) -VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32) -VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32) - -#undef VMOVL_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MOVL_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movn.h b/arm_compute/core/NEON/wrapper/intrinsics/movn.h deleted file mode 100644 index ed3b159fa2..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/movn.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_MOVN_H -#define ARM_COMPUTE_WRAPPER_MOVN_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VMOVN_IMPL(dtype, vtype, prefix, postfix) \ - inline dtype vmovn(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VMOVN_IMPL(uint32x2_t, uint64x2_t, vmovn, u64) -VMOVN_IMPL(int32x2_t, int64x2_t, vmovn, s64) -VMOVN_IMPL(uint16x4_t, uint32x4_t, vmovn, u32) -VMOVN_IMPL(int16x4_t, int32x4_t, vmovn, s32) -VMOVN_IMPL(uint8x8_t, uint16x8_t, vmovn, u16) -VMOVN_IMPL(int8x8_t, int16x8_t, vmovn, s16) - -#define VQMOVN_IMPL(dtype, vtype, prefix, postfix) \ - inline dtype vqmovn(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VQMOVN_IMPL(uint32x2_t, uint64x2_t, vqmovn, u64) -VQMOVN_IMPL(int32x2_t, int64x2_t, vqmovn, s64) -VQMOVN_IMPL(uint16x4_t, uint32x4_t, vqmovn, u32) -VQMOVN_IMPL(int16x4_t, int32x4_t, vqmovn, s32) -VQMOVN_IMPL(uint8x8_t, uint16x8_t, vqmovn, u16) -VQMOVN_IMPL(int8x8_t, int16x8_t, vqmovn, s16) - -#undef VMOVN_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MOVN_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mul.h b/arm_compute/core/NEON/wrapper/intrinsics/mul.h deleted file mode 100644 index 88ea87aeef..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/mul.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. 
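vmovl widens to the next element size and vmovn/vqmovn narrow back (vqmovn with saturation), the standard pattern for doing 8-bit arithmetic at 16-bit precision. Sketch (illustrative helper, not part of the patch):

    #include <arm_neon.h>

    // Add a signed bias at 16-bit precision, then narrow to u8 with saturation.
    uint8x8_t add_bias_u8(uint8x8_t in, int16x8_t bias)
    {
        const int16x8_t wide = vreinterpretq_s16_u16(vmovl_u8(in)); // widen, view as s16
        const int16x8_t sum  = vaddq_s16(wide, bias);
        return vqmovun_s16(sum); // saturate to [0, 255]
    }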
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_MUL_H -#define ARM_COMPUTE_WRAPPER_MUL_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VMUL_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vmul(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VMUL_IMPL(uint8x8_t, uint8x8_t, vmul, u8) -VMUL_IMPL(int8x8_t, int8x8_t, vmul, s8) -VMUL_IMPL(uint16x4_t, uint16x4_t, vmul, u16) -VMUL_IMPL(int16x4_t, int16x4_t, vmul, s16) -VMUL_IMPL(uint32x2_t, uint32x2_t, vmul, u32) -VMUL_IMPL(int32x2_t, int32x2_t, vmul, s32) -VMUL_IMPL(float32x2_t, float32x2_t, vmul, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMUL_IMPL(float16_t, float16x4_t, vmul, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VMUL_IMPL(uint8_t, uint8x16_t, vmulq, u8) -VMUL_IMPL(int8_t, int8x16_t, vmulq, s8) -VMUL_IMPL(uint16_t, uint16x8_t, vmulq, u16) -VMUL_IMPL(int16_t, int16x8_t, vmulq, s16) -VMUL_IMPL(uint32_t, uint32x4_t, vmulq, u32) -VMUL_IMPL(int32_t, int32x4_t, vmulq, s32) -VMUL_IMPL(float32x4_t, float32x4_t, vmulq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VMUL_IMPL(float16_t, float16x8_t, vmulq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VMUL_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MUL_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/neg.h b/arm_compute/core/NEON/wrapper/intrinsics/neg.h deleted file mode 100644 index c0c73dcaaf..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/neg.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_NEG_H -#define ARM_COMPUTE_WRAPPER_NEG_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VNEG_IMPL(vtype, prefix, postfix) \ - inline vtype vneg(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VNEG_IMPL(int8x8_t, vneg, s8) -VNEG_IMPL(int16x4_t, vneg, s16) -VNEG_IMPL(int32x2_t, vneg, s32) -VNEG_IMPL(float32x2_t, vneg, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VNEG_IMPL(float16x4_t, vneg, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VNEG_IMPL(int8x16_t, vnegq, s8) -VNEG_IMPL(int16x8_t, vnegq, s16) -VNEG_IMPL(int32x4_t, vnegq, s32) -VNEG_IMPL(float32x4_t, vnegq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VNEG_IMPL(float16x8_t, vnegq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VNEG_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_NEG_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/not.h b/arm_compute/core/NEON/wrapper/intrinsics/not.h deleted file mode 100644 index 084b2a4944..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/not.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_NOT_H -#define ARM_COMPUTE_WRAPPER_NOT_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VNOT_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vnot(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VNOT_IMPL(uint8_t, uint8x8_t, vmvn, u8) -VNOT_IMPL(int8_t, int8x8_t, vmvn, s8) -VNOT_IMPL(uint16_t, uint16x4_t, vmvn, u16) -VNOT_IMPL(int16_t, int16x4_t, vmvn, s16) -VNOT_IMPL(uint32_t, uint32x2_t, vmvn, u32) -VNOT_IMPL(int32_t, int32x2_t, vmvn, s32) -VNOT_IMPL(float32x2_t, float32x2_t, vinv, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VNOT_IMPL(float16x4_t, float16x4_t, vinv, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VNOT_IMPL(uint8_t, uint8x16_t, vmvnq, u8) -VNOT_IMPL(int8_t, int8x16_t, vmvnq, s8) -VNOT_IMPL(uint16_t, uint16x8_t, vmvnq, u16) -VNOT_IMPL(int16_t, int16x8_t, vmvnq, s16) -VNOT_IMPL(uint32_t, uint32x4_t, vmvnq, u32) -VNOT_IMPL(int32_t, int32x4_t, vmvnq, s32) -VNOT_IMPL(float32x4_t, float32x4_t, vinvq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VNOT_IMPL(float16x8_t, float16x8_t, vinvq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VNOT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_NOT_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h b/arm_compute/core/NEON/wrapper/intrinsics/orr.h deleted file mode 100644 index 13979fe539..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/orr.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_ORR_H -#define ARM_COMPUTE_WRAPPER_ORR_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VORR_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vorr(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VORR_IMPL(uint8_t, uint8x8_t, vorr, u8) -VORR_IMPL(int8_t, int8x8_t, vorr, s8) -VORR_IMPL(uint16_t, uint16x4_t, vorr, u16) -VORR_IMPL(int16_t, int16x4_t, vorr, s16) -VORR_IMPL(uint32_t, uint32x2_t, vorr, u32) -VORR_IMPL(int32_t, int32x2_t, vorr, s32) -VORR_IMPL(uint64_t, uint64x1_t, vorr, u64) -VORR_IMPL(int64_t, int64x1_t, vorr, s64) - -VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8) -VORR_IMPL(int8_t, int8x16_t, vorrq, s8) -VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16) -VORR_IMPL(int16_t, int16x8_t, vorrq, s16) -VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32) -VORR_IMPL(int32_t, int32x4_t, vorrq, s32) -VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64) -VORR_IMPL(int64_t, int64x2_t, vorrq, s64) - -#undef VORR_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_ORR_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h deleted file mode 100644 index ba8d9cc6c4..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_PMAX_H -#define ARM_COMPUTE_WRAPPER_PMAX_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VPMAX_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vpmax(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8) -VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8) -VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16) -VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16) -VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32) -VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32) -VPMAX_IMPL(float, float32x2_t, vpmax, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VPMAX_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_PMAX_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h deleted file mode 100644 index 45e64a834a..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_PMIN_H -#define ARM_COMPUTE_WRAPPER_PMIN_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VPMIN_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vpmin(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8) -VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8) -VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16) -VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16) -VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32) -VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32) -VPMIN_IMPL(float, float32x2_t, vpmin, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VPMIN_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_PMIN_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pow.h b/arm_compute/core/NEON/wrapper/intrinsics/pow.h deleted file mode 100644 index bffbc4f7b2..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/pow.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. 
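vpmax reduces pairwise, so a full horizontal maximum of a q-register takes one split plus two pairwise steps. Sketch (illustrative helper, not part of the patch):

    #include <arm_neon.h>

    // Horizontal maximum of four floats via pairwise reduction.
    float hmax_f32(float32x4_t v)
    {
        float32x2_t m = vpmax_f32(vget_low_f32(v), vget_high_f32(v)); // {max(v0,v1), max(v2,v3)}
        m = vpmax_f32(m, m);                                          // {max(all), max(all)}
        return vget_lane_f32(m, 0);
    }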
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_POW_H -#define ARM_COMPUTE_WRAPPER_POW_H - -#include "arm_compute/core/NEON/NEMath.h" -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VPOW_IMPL(vtype, prefix, postfix) \ - inline vtype vpow(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VPOW_IMPL(float32x4_t, vpowq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VPOW_IMPL(float16x8_t, vpowq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VPOW_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_POW_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h b/arm_compute/core/NEON/wrapper/intrinsics/qmov.h deleted file mode 100644 index 167f3cf43b..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#ifndef ARM_COMPUTE_WRAPPER_QMOV_H
-#define ARM_COMPUTE_WRAPPER_QMOV_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
-vqmov(const int16x8_t &a)
-{
-    return vqmovun_s16(a);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
-vqmov(const int16x8_t &a)
-{
-    return vqmovn_s16(a);
-}
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_QMOV_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h b/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
deleted file mode 100644
index f823ddb513..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_QMOVUN_H
-#define ARM_COMPUTE_WRAPPER_QMOVUN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VQMOVUN_IMPL(dtype, vtype, prefix, postfix) \
-    inline dtype vqmovun(const vtype &a)            \
-    {                                               \
-        return prefix##_##postfix(a);               \
-    }
-
-VQMOVUN_IMPL(uint32x2_t, int64x2_t, vqmovun, s64)
-VQMOVUN_IMPL(uint16x4_t, int32x4_t, vqmovun, s32)
-VQMOVUN_IMPL(uint8x8_t, int16x8_t, vqmovun, s16)
-
-#undef VQMOVUN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_QMOVUN_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
deleted file mode 100644
index 0c26cd9008..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
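vqmov is the tag-dispatched narrow: the template argument names the destination scalar type, so quantized kernels can select signed vs. unsigned saturating narrowing from a template parameter. Usage sketch (illustrative helper, assumes the qmov.h definitions above are in scope):

    // In a kernel templated on the quantized output type T (uint8_t or int8_t);
    // return type deduced per overload (C++14).
    template <typename T>
    auto narrow_result(int16x8_t acc)
    {
        // vqmovun_s16 for T = uint8_t, vqmovn_s16 for T = int8_t.
        return arm_compute::wrapper::vqmov<T>(acc);
    }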
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_REINTERPRET_H -#define ARM_COMPUTE_WRAPPER_REINTERPRET_H - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VREINTERPRET_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - inline ptype vreinterpret(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ - } \ - \ - inline ptype vreinterpret(const ptype &a) \ - { \ - return a; \ - } - -VREINTERPRET_IMPL(int16x4_t, uint16x4_t, vreinterpret, s16, u16) - -VREINTERPRET_IMPL(int32x4_t, uint32x4_t, vreinterpretq, s32, u32) -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_REINTERPRET_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h b/arm_compute/core/NEON/wrapper/intrinsics/rev64.h deleted file mode 100644 index 1119c34654..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_REV64_H -#define ARM_COMPUTE_WRAPPER_REV64_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VREV64_IMPL(vtype, prefix, postfix) \ - inline vtype vrev64(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VREV64_IMPL(uint8x8_t, vrev64, u8) -VREV64_IMPL(int8x8_t, vrev64, s8) -VREV64_IMPL(uint16x4_t, vrev64, u16) -VREV64_IMPL(int16x4_t, vrev64, s16) -VREV64_IMPL(uint32x2_t, vrev64, u32) -VREV64_IMPL(int32x2_t, vrev64, s32) -VREV64_IMPL(float32x2_t, vrev64, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VREV64_IMPL(float16x4_t, vrev64, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VREV64_IMPL(uint8x16_t, vrev64q, u8) -VREV64_IMPL(int8x16_t, vrev64q, s8) -VREV64_IMPL(uint16x8_t, vrev64q, u16) -VREV64_IMPL(int16x8_t, vrev64q, s16) -VREV64_IMPL(uint32x4_t, vrev64q, u32) -VREV64_IMPL(int32x4_t, vrev64q, s32) -VREV64_IMPL(float32x4_t, vrev64q, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VREV64_IMPL(float16x8_t, vrev64q, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VREV64_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_REV64_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/round.h b/arm_compute/core/NEON/wrapper/intrinsics/round.h deleted file mode 100644 index dd068ea709..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/round.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
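A brief usage sketch for the vrev64 overloads above (lane values assumed for illustration): element order is reversed within each 64-bit block, so a D register is reversed wholesale.

    const uint8x8_t v   = {0, 1, 2, 3, 4, 5, 6, 7};
    const uint8x8_t rev = arm_compute::wrapper::vrev64(v); // {7, 6, 5, 4, 3, 2, 1, 0}
    // On a Q register the two 64-bit halves are reversed independently.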
- */ -#ifndef ARM_COMPUTE_WRAPPER_ROUND_H -#define ARM_COMPUTE_WRAPPER_ROUND_H - -#include "arm_compute/core/NEON/NEMath.h" -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VROUNDQ_IMPL(vtype, postfix) \ - inline vtype vround(const vtype &a) \ - { \ - return vroundq_rte_##postfix(a); \ - } - -#define VROUNDQ_IMPL_INT(vtype, postfix) \ - inline vtype vround(const vtype &a) \ - { \ - ARM_COMPUTE_UNUSED(a); \ - ARM_COMPUTE_ERROR("Not supported"); \ - } - -VROUNDQ_IMPL(float32x4_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VROUNDQ_IMPL(float16x8_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VROUNDQ_IMPL_INT(int32x4_t, s32) -#undef VROUNDQ_IMPL - -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_ROUND_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h b/arm_compute/core/NEON/wrapper/intrinsics/setlane.h deleted file mode 100644 index 197eedacb5..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
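A short sketch of the dispatch above: vround does real work only for float vectors (round to nearest even via the NEMath helpers), while the integer overload aborts at runtime so that templated kernels still compile for every type.

    const float32x4_t x = vdupq_n_f32(2.5f);
    const float32x4_t r = arm_compute::wrapper::vround(x); // 2.0f per lane: ties round to even
    // arm_compute::wrapper::vround(vdupq_n_s32(1)) would hit ARM_COMPUTE_ERROR("Not supported").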
- */ -#ifndef ARM_COMPUTE_WRAPPER_SET_LANE_H -#define ARM_COMPUTE_WRAPPER_SET_LANE_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \ - inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vset_lane_##postfix(value, vector, 0); \ - case 1: \ - return vset_lane_##postfix(value, vector, 1); \ - case 2: \ - return vset_lane_##postfix(value, vector, 2); \ - case 3: \ - return vset_lane_##postfix(value, vector, 3); \ - case 4: \ - return vset_lane_##postfix(value, vector, 4); \ - case 5: \ - return vset_lane_##postfix(value, vector, 5); \ - case 6: \ - return vset_lane_##postfix(value, vector, 6); \ - case 7: \ - return vset_lane_##postfix(value, vector, 7); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \ - inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vset_lane_##postfix(value, vector, 0); \ - case 1: \ - return vset_lane_##postfix(value, vector, 1); \ - case 2: \ - return vset_lane_##postfix(value, vector, 2); \ - case 3: \ - return vset_lane_##postfix(value, vector, 3); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \ - inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vset_lane_##postfix(value, vector, 0); \ - case 1: \ - return vset_lane_##postfix(value, vector, 1); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -VSETLANE_IMPL_8(uint8x8_t, uint8_t, uint8x8_t, u8) -VSETLANE_IMPL_8(int8x8_t, int8_t, int8x8_t, s8) -VSETLANE_IMPL_4(uint16x4_t, uint16_t, uint16x4_t, u16) -VSETLANE_IMPL_4(int16x4_t, int16_t, int16x4_t, s16) -VSETLANE_IMPL_2(uint32x2_t, uint32_t, uint32x2_t, u32) -VSETLANE_IMPL_2(int32x2_t, int32_t, int32x2_t, s32) -VSETLANE_IMPL_2(float32x2_t, float, float32x2_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \ - inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vsetq_lane_##postfix(value, vector, 0); \ - case 1: \ - return vsetq_lane_##postfix(value, vector, 1); \ - case 2: \ - return vsetq_lane_##postfix(value, vector, 2); \ - case 3: \ - return vsetq_lane_##postfix(value, vector, 3); \ - case 4: \ - return vsetq_lane_##postfix(value, vector, 4); \ - case 5: \ - return vsetq_lane_##postfix(value, vector, 5); \ - case 6: \ - return vsetq_lane_##postfix(value, vector, 6); \ - case 7: \ - return vsetq_lane_##postfix(value, vector, 7); \ - case 8: \ - return vsetq_lane_##postfix(value, vector, 8); \ - case 9: \ - return vsetq_lane_##postfix(value, vector, 9); \ - case 10: \ - return vsetq_lane_##postfix(value, vector, 10); \ - case 11: \ - return vsetq_lane_##postfix(value, vector, 11); \ - case 12: \ - return vsetq_lane_##postfix(value, vector, 12); \ - case 13: \ - return vsetq_lane_##postfix(value, vector, 13); \ - case 14: \ - return vsetq_lane_##postfix(value, vector, 14); \ - case 15: \ - return vsetq_lane_##postfix(value, vector, 15); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define
VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \ - inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vsetq_lane_##postfix(value, vector, 0); \ - case 1: \ - return vsetq_lane_##postfix(value, vector, 1); \ - case 2: \ - return vsetq_lane_##postfix(value, vector, 2); \ - case 3: \ - return vsetq_lane_##postfix(value, vector, 3); \ - case 4: \ - return vsetq_lane_##postfix(value, vector, 4); \ - case 5: \ - return vsetq_lane_##postfix(value, vector, 5); \ - case 6: \ - return vsetq_lane_##postfix(value, vector, 6); \ - case 7: \ - return vsetq_lane_##postfix(value, vector, 7); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \ - inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ - { \ - switch(lane) \ - { \ - case 0: \ - return vsetq_lane_##postfix(value, vector, 0); \ - case 1: \ - return vsetq_lane_##postfix(value, vector, 1); \ - case 2: \ - return vsetq_lane_##postfix(value, vector, 2); \ - case 3: \ - return vsetq_lane_##postfix(value, vector, 3); \ - default: \ - ARM_COMPUTE_ERROR("Invalid lane"); \ - } \ - } - -VSETQLANE_IMPL_16(uint8x16_t, uint8_t, uint8x16_t, u8) -VSETQLANE_IMPL_16(int8x16_t, int8_t, int8x16_t, s8) -VSETQLANE_IMPL_8(uint16x8_t, uint16_t, uint16x8_t, u16) -VSETQLANE_IMPL_8(int16x8_t, int16_t, int16x8_t, s16) -VSETQLANE_IMPL_4(uint32x4_t, uint32_t, uint32x4_t, u32) -VSETQLANE_IMPL_4(int32x4_t, int32_t, int32x4_t, s32) -VSETQLANE_IMPL_4(float32x4_t, float, float32x4_t, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VSETQLANE_IMPL_8(float16x8_t, float16_t, float16x8_t, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VSETLANE_IMPL_8 -#undef VSETLANE_IMPL_4 -#undef VSETLANE_IMPL_2 - -#undef VSETQLANE_IMPL_16 -#undef VSETQLANE_IMPL_8 -#undef VSETQLANE_IMPL_4 -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SET_LANE_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sin.h b/arm_compute/core/NEON/wrapper/intrinsics/sin.h deleted file mode 100644 index 7c9cc468ed..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/sin.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
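The switch-based expansion above exists because vset_lane_*/vsetq_lane_* require the lane index to be a compile-time immediate; vsetlane trades a switch statement for a runtime index. A minimal sketch:

    float32x4_t acc = vdupq_n_f32(0.f);
    for(unsigned int lane = 0; lane < 4; ++lane)
    {
        // The runtime lane is mapped onto the matching vsetq_lane_f32 immediate.
        acc = arm_compute::wrapper::vsetlane(static_cast<float>(lane), acc, lane);
    }
    // acc now holds {0.f, 1.f, 2.f, 3.f}.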
- */ -#ifndef ARM_COMPUTE_WRAPPER_SIN_H -#define ARM_COMPUTE_WRAPPER_SIN_H - -#include "arm_compute/core/NEON/NEMath.h" -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VSIN_IMPL(vtype, prefix, postfix) \ - inline vtype vsin(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -#define VSIN_IMPL_INT(vtype, prefix, postfix) \ - inline vtype vsin(const vtype &a) \ - { \ - ARM_COMPUTE_UNUSED(a); \ - ARM_COMPUTE_ERROR("Not supported"); \ - } - -VSIN_IMPL(float32x4_t, vsinq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VSIN_IMPL(float16x8_t, vsinq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VSIN_IMPL_INT(int32x4_t, vsinq, s32) - -#undef VSIN_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SIN_H */ \ No newline at end of file diff --git a/arm_compute/core/NEON/wrapper/intrinsics/store.h b/arm_compute/core/NEON/wrapper/intrinsics/store.h deleted file mode 100644 index 6dda432ea9..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/store.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
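A usage sketch for the vsin overloads above; results are approximate, since vsinq_f32 comes from the NEMath polynomial approximations rather than libm.

    const float32x4_t angles = vdupq_n_f32(3.14159265f / 2.f);
    const float32x4_t s      = arm_compute::wrapper::vsin(angles); // each lane approximately 1.0f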
- */ -#ifndef ARM_COMPUTE_WRAPPER_STORE_H -#define ARM_COMPUTE_WRAPPER_STORE_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VSTORE_IMPL(stype, vtype, prefix, postfix) \ - inline void vstore(stype *ptr, vtype val) \ - { \ - prefix##_##postfix(ptr, val); \ - } - -VSTORE_IMPL(uint8_t, uint8x8_t, vst1, u8) -VSTORE_IMPL(uint8_t, uint8x8x2_t, vst2, u8) -VSTORE_IMPL(int8_t, int8x8_t, vst1, s8) -VSTORE_IMPL(int8_t, int8x8x2_t, vst2, s8) -VSTORE_IMPL(uint16_t, uint16x4_t, vst1, u16) -VSTORE_IMPL(int16_t, int16x4_t, vst1, s16) -VSTORE_IMPL(uint32_t, uint32x2_t, vst1, u32) -VSTORE_IMPL(int32_t, int32x2_t, vst1, s32) -//VSTORE_IMPL(uint64_t, 1, vst1, u64) -//VSTORE_IMPL(int64_t, 1, vst1, s64) -VSTORE_IMPL(float, float32x2_t, vst1, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VSTORE_IMPL(float16_t, float16x4_t, vst1, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VSTORE_IMPL(uint8_t, uint8x16_t, vst1q, u8) -VSTORE_IMPL(int8_t, int8x16_t, vst1q, s8) -VSTORE_IMPL(uint16_t, uint16x8_t, vst1q, u16) -VSTORE_IMPL(int16_t, int16x8_t, vst1q, s16) -VSTORE_IMPL(uint32_t, uint32x4_t, vst1q, u32) -VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32) -//VSTORE_IMPL(uint64_t, 2, vst1q, u64) -//VSTORE_IMPL(int64_t, 2, vst1q, s64) -VSTORE_IMPL(float, float32x4_t, vst1q, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VSTORE_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_STORE_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sub.h b/arm_compute/core/NEON/wrapper/intrinsics/sub.h deleted file mode 100644 index 475986d0f6..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/sub.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
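A sketch of the overload set above: a single vstore name resolves to vst1 or vst1q from the width of the vector argument, which is what lets templated kernels stay width-agnostic.

    float dst_q[4];
    arm_compute::wrapper::vstore(dst_q, vdupq_n_f32(1.f)); // resolves to vst1q_f32
    float dst_d[2];
    arm_compute::wrapper::vstore(dst_d, vdup_n_f32(1.f));  // resolves to vst1_f32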
- */ -#ifndef ARM_COMPUTE_WRAPPER_SUB_H -#define ARM_COMPUTE_WRAPPER_SUB_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VSUB_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vsub(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VSUB_IMPL(uint8x8_t, uint8x8_t, vsub, u8) -VSUB_IMPL(int8x8_t, int8x8_t, vsub, s8) -VSUB_IMPL(uint16x4_t, uint16x4_t, vsub, u16) -VSUB_IMPL(int16x4_t, int16x4_t, vsub, s16) -VSUB_IMPL(uint32x2_t, uint32x2_t, vsub, u32) -VSUB_IMPL(int32x2_t, int32x2_t, vsub, s32) -VSUB_IMPL(uint64x1_t, uint64x1_t, vsub, u64) -VSUB_IMPL(int64x1_t, int64x1_t, vsub, s64) -VSUB_IMPL(float32x2_t, float32x2_t, vsub, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VSUB_IMPL(float16x4_t, float16x4_t, vsub, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VSUB_IMPL(uint8x16_t, uint8x16_t, vsubq, u8) -VSUB_IMPL(int8x16_t, int8x16_t, vsubq, s8) -VSUB_IMPL(uint16x8_t, uint16x8_t, vsubq, u16) -VSUB_IMPL(int16x8_t, int16x8_t, vsubq, s16) -VSUB_IMPL(uint32x4_t, uint32x4_t, vsubq, u32) -VSUB_IMPL(int32x4_t, int32x4_t, vsubq, s32) -VSUB_IMPL(uint64x2_t, uint64x2_t, vsubq, u64) -VSUB_IMPL(int64x2_t, int64x2_t, vsubq, s64) -VSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VSUB_IMPL - -// VQSUB: Vector saturating sub (No notion of saturation for floating point) -#define VQSUB_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vqsub(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VQSUB_IMPL(uint8x8_t, uint8x8_t, vqsub, u8) -VQSUB_IMPL(int8x8_t, int8x8_t, vqsub, s8) -VQSUB_IMPL(uint16x4_t, uint16x4_t, vqsub, u16) -VQSUB_IMPL(int16x4_t, int16x4_t, vqsub, s16) -VQSUB_IMPL(uint32x2_t, uint32x2_t, vqsub, u32) -VQSUB_IMPL(int32x2_t, int32x2_t, vqsub, s32) -VQSUB_IMPL(uint64x1_t, uint64x1_t, vqsub, u64) -VQSUB_IMPL(int64x1_t, int64x1_t, vqsub, s64) -VQSUB_IMPL(float32x2_t, float32x2_t, vsub, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VQSUB_IMPL(float16x4_t, float16x4_t, vsub, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VQSUB_IMPL(uint8x16_t, uint8x16_t, vqsubq, u8) -VQSUB_IMPL(int8x16_t, int8x16_t, vqsubq, s8) -VQSUB_IMPL(uint16x8_t, uint16x8_t, vqsubq, u16) -VQSUB_IMPL(int16x8_t, int16x8_t, vqsubq, s16) -VQSUB_IMPL(uint32x4_t, uint32x4_t, vqsubq, u32) -VQSUB_IMPL(int32x4_t, int32x4_t, vqsubq, s32) -VQSUB_IMPL(uint64x2_t, uint64x2_t, vqsubq, u64) -VQSUB_IMPL(int64x2_t, int64x2_t, vqsubq, s64) -VQSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VQSUB_IMPL - -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h b/arm_compute/core/NEON/wrapper/intrinsics/tanh.h deleted file mode 100644 index 2943b9b1ea..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited.
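A sketch contrasting the two families above (lane values assumed): vsub wraps on integer overflow, vqsub clamps, and on floats both reduce to plain subtraction, as the comment in the header notes.

    const uint8x8_t a = vdup_n_u8(10);
    const uint8x8_t b = vdup_n_u8(20);
    const uint8x8_t wrapped   = arm_compute::wrapper::vsub(a, b);  // 246 per lane (modular wrap)
    const uint8x8_t saturated = arm_compute::wrapper::vqsub(a, b); // 0 per lane (clamped)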
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_TANH_H -#define ARM_COMPUTE_WRAPPER_TANH_H - -#include "arm_compute/core/NEON/NEMath.h" -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VTANH_IMPL(vtype, prefix, postfix) \ - inline vtype vtanh(const vtype &a) \ - { \ - return prefix##_##postfix(a); \ - } - -VTANH_IMPL(float32x4_t, vtanhq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VTANH_IMPL(float16x8_t, vtanhq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VTANH_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_TANH_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h b/arm_compute/core/NEON/wrapper/intrinsics/tbl.h deleted file mode 100644 index 05e6c1fc13..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_WRAPPER_TBL_H -#define ARM_COMPUTE_WRAPPER_TBL_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -#define VTBL_IMPL(stype, vtype, prefix, postfix) \ - inline vtype vtbl(const stype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VTBL_IMPL(uint8x8x2_t, uint8x8_t, vtbl2, u8) -VTBL_IMPL(int8x8x2_t, int8x8_t, vtbl2, s8) - -#undef VTBL_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_TBL_H */ diff --git a/arm_compute/core/NEON/wrapper/scalar/add.h b/arm_compute/core/NEON/wrapper/scalar/add.h deleted file mode 100644 index 642d9261f3..0000000000 --- a/arm_compute/core/NEON/wrapper/scalar/add.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_SCALAR_ADD_H -#define ARM_COMPUTE_WRAPPER_SCALAR_ADD_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -inline uint8_t add_sat(const uint8_t &a, const uint8_t &b) -{ - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; - return vget_lane_u8(vqadd_u8(va, vb), 0); -} - -inline int16_t add_sat(const int16_t &a, const int16_t &b) -{ - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; - return vget_lane_s16(vqadd_s16(va, vb), 0); -} - -inline int32_t add_sat(const int32_t &a, const int32_t &b) -{ - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; - return vget_lane_s32(vqadd_s32(va, vb), 0); -} - -inline float add_sat(const float &a, const float &b) -{ - // No notion of saturation exists in floating point - return a + b; -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -inline float16_t add_sat(const float16_t &a, const float16_t &b) -{ - // No notion of saturation exists in floating point - return a + b; -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SCALAR_ADD_H */ diff --git a/arm_compute/core/NEON/wrapper/scalar/scalar.h b/arm_compute/core/NEON/wrapper/scalar/scalar.h deleted file mode 100644 index 1bc50c2740..0000000000 --- a/arm_compute/core/NEON/wrapper/scalar/scalar.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited.
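A usage sketch for the scalar add_sat helpers above, as used for the leftover elements after a vectorised loop body (the values are illustrative):

    const uint8_t u = arm_compute::wrapper::add_sat(uint8_t{200}, uint8_t{100});    // 255, saturated
    const int16_t s = arm_compute::wrapper::add_sat(int16_t{32000}, int16_t{1000}); // 32767, saturated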
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_SCALAR_H -#define ARM_COMPUTE_WRAPPER_SCALAR_H - -#include "arm_compute/core/NEON/wrapper/scalar/add.h" -#include "arm_compute/core/NEON/wrapper/scalar/sub.h" - -#endif /* ARM_COMPUTE_WRAPPER_SCALAR_H */ diff --git a/arm_compute/core/NEON/wrapper/scalar/sub.h b/arm_compute/core/NEON/wrapper/scalar/sub.h deleted file mode 100644 index 1fe51d75fc..0000000000 --- a/arm_compute/core/NEON/wrapper/scalar/sub.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_WRAPPER_SCALAR_SUB_H -#define ARM_COMPUTE_WRAPPER_SCALAR_SUB_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b) -{ - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; - return vget_lane_u8(vqsub_u8(va, vb), 0); -} - -inline int16_t sub_sat(const int16_t &a, const int16_t &b) -{ - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; - return vget_lane_s16(vqsub_s16(va, vb), 0); -} - -inline int32_t sub_sat(const int32_t &a, const int32_t &b) -{ - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; - return vget_lane_s32(vqsub_s32(va, vb), 0); -} - -inline float sub_sat(const float &a, const float &b) -{ - // No notion of saturation exists in floating point - return a - b; -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -inline float16_t sub_sat(const float16_t &a, const float16_t &b) -{ - // No notion of saturation exists in floating point - return a - b; -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SCALAR_SUB_H */ diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h deleted file mode 100644 index eafbeef372..0000000000 --- a/arm_compute/core/NEON/wrapper/traits.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H -#define ARM_COMPUTE_WRAPPER_TRAITS_H - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace wrapper -{ -namespace traits -{ -// *INDENT-OFF* -// clang-format off - -/** 64-bit vector tag */ -struct vector_64_tag {}; -/** 128-bit vector tag */ -struct vector_128_tag {}; - -/** Create the appropriate NEON vector given its type and size in terms of elements */ -template <typename T, int S> struct neon_vector; - -// Specializations -#ifndef DOXYGEN_SKIP_THIS -template <> struct neon_vector<uint8_t, 8>{ using scalar_type = uint8_t; using type = uint8x8_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<int8_t, 8>{ using scalar_type = int8_t; using type = int8x8_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<uint8_t, 16>{ using scalar_type = uint8_t; using type = uint8x16_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<int8_t, 16>{ using scalar_type = int8_t; using type = int8x16_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<uint16_t, 4>{ using scalar_type = uint16_t; using type = uint16x4_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<int16_t, 4>{ using scalar_type = int16_t; using type = int16x4_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<uint16_t, 8>{ using scalar_type = uint16_t; using type = uint16x8_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<uint16_t, 16>{ using scalar_type = uint16_t; using type = uint16x8x2_t; }; -template <> struct neon_vector<int16_t, 8>{ using scalar_type = int16_t; using type = int16x8_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<int16_t, 16>{ using scalar_type = int16_t; using type = int16x8x2_t; }; -template <> struct neon_vector<uint32_t, 2>{ using scalar_type = uint32_t; using type = uint32x2_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<int32_t, 2>{ using scalar_type = int32_t; using type = int32x2_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<uint32_t, 4>{ using scalar_type = uint32_t; using type = uint32x4_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<int32_t, 4>{ using scalar_type = int32_t; using type = int32x4_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<uint64_t, 1>{ using scalar_type = uint64_t; using type = uint64x1_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<int64_t, 1>{ using scalar_type = int64_t; using type = int64x1_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<uint64_t, 2>{ using scalar_type = uint64_t; using type = uint64x2_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<int64_t, 2>{ using scalar_type = int64_t; using type = int64x2_t; using tag_type = vector_128_tag; }; -template <> struct neon_vector<float_t, 2>{ using scalar_type = float_t; using type = float32x2_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<float_t, 4>{ using scalar_type = float_t; using type = float32x4_t; using tag_type = vector_128_tag; }; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> struct neon_vector<float16_t, 4>{ using scalar_type = float16_t; using type = float16x4_t; using tag_type = vector_64_tag; }; -template <> struct neon_vector<float16_t, 8>{ using scalar_type = float16_t; using type = float16x8_t; using tag_type = vector_128_tag; }; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#endif /* DOXYGEN_SKIP_THIS */ - -/** Helper type template to get the type of a neon vector */ -template <typename T, int S> using neon_vector_t = typename neon_vector<T, S>::type; -/** Helper type template to get the tag type of a neon vector */ -template <typename T, int S> using neon_vector_tag_t = typename neon_vector<T, S>::tag_type; - -/** Vector bit-width enum class */ -enum class BitWidth -{ - W64, /**< 64-bit width */ - W128, /**< 128-bit width */ -}; - -/** Create the appropriate NEON vector given its type and size in terms of bits */ -template <typename T, BitWidth BW> struct neon_bitvector; -// Specializations -#ifndef DOXYGEN_SKIP_THIS -template <> struct neon_bitvector<uint8_t, BitWidth::W64>{ using type = uint8x8_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<int8_t, BitWidth::W64>{ using type = int8x8_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<uint8_t, BitWidth::W128>{ using type = uint8x16_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<int8_t, BitWidth::W128>{ using type = int8x16_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<uint16_t, BitWidth::W64>{ using type = uint16x4_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<int16_t, BitWidth::W64>{ using type = int16x4_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<uint16_t, BitWidth::W128>{ using type = uint16x8_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<int16_t, BitWidth::W128>{ using type = int16x8_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<uint32_t, BitWidth::W64>{ using type = uint32x2_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<int32_t, BitWidth::W64>{ using type = int32x2_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<uint32_t, BitWidth::W128>{ using type = uint32x4_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<int32_t, BitWidth::W128>{ using type = int32x4_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<uint64_t, BitWidth::W64>{ using type = uint64x1_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<int64_t, BitWidth::W64>{ using type = int64x1_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<uint64_t, BitWidth::W128>{ using type = uint64x2_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<int64_t, BitWidth::W128>{ using type = int64x2_t; using tag_type = vector_128_tag; }; -template <> struct neon_bitvector<float_t, BitWidth::W64>{ using type = float32x2_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<float_t, BitWidth::W128>{ using type = float32x4_t; using tag_type = vector_128_tag; }; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> struct neon_bitvector<float16_t, BitWidth::W64>{ using type = float16x4_t; using tag_type = vector_64_tag; }; -template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float16x8_t; using tag_type = vector_128_tag; }; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#endif /* DOXYGEN_SKIP_THIS */ - -/** Helper type template to get the type of a neon vector */ -template <typename T, BitWidth BW> using neon_bitvector_t = typename neon_bitvector<T, BW>::type; -/** Helper type template to get the tag type of a neon vector */ -template <typename T, BitWidth BW> using neon_bitvector_tag_t = typename neon_bitvector<T, BW>::tag_type; - -/** Promote a type */ -template <typename T> struct promote { }; -template <> struct promote<uint8_t> { using type = uint16_t; }; -template <> struct promote<int8_t> { using type = int16_t; }; -template <> struct promote<uint16_t> { using type = uint32_t; }; -template <> struct promote<int16_t> { using type = int32_t; }; -template <> struct promote<uint32_t> { using type = uint64_t; }; -template <> struct promote<int32_t> { using type = int64_t; }; -template <> struct promote<float> { using type = float; }; -template <> struct promote<half> { using type = half; }; - -/** Get promoted type */ -template <typename T> -using promote_t = typename promote<T>::type; - -// clang-format on -// *INDENT-ON* -} // namespace traits -} // namespace wrapper -} // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */ diff --git a/arm_compute/core/NEON/wrapper/wrapper.h b/arm_compute/core/NEON/wrapper/wrapper.h deleted file mode 100644 index e0c290887b..0000000000 --- a/arm_compute/core/NEON/wrapper/wrapper.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm
Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_H -#define ARM_COMPUTE_WRAPPER_H - -// Traits -#include "arm_compute/core/NEON/wrapper/traits.h" - -// Intrinsics Overloads -#include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "arm_compute/core/NEON/wrapper/scalar/scalar.h" - -#endif /* ARM_COMPUTE_WRAPPER_H */ diff --git a/docs/ComputeLibrary.dir b/docs/ComputeLibrary.dir index 56451134d9..270e7fac9c 100644 --- a/docs/ComputeLibrary.dir +++ b/docs/ComputeLibrary.dir @@ -87,30 +87,6 @@ * @brief Folder containing all the NEON kernels */ -/** @dir arm_compute/core/NEON/kernels/detail - * @brief Common code for several intrinsics implementations. - */ - -/** @dir arm_compute/core/NEON/wrapper - * @brief NEON wrapper used to simplify code - */ - -/** @file arm_compute/core/NEON/wrapper/traits.h - * @brief Traits defined on NEON vectors - */ - -/** @file arm_compute/core/NEON/wrapper/wrapper.h - * @brief Includes all wrapper headers at once - */ - -/** @dir arm_compute/core/NEON/wrapper/intrinsics - * @brief NEON intrinsics wrappers - */ - -/** @dir arm_compute/core/NEON/wrapper/scalar - * @brief Scalar operations - */ - /** @dir arm_compute/core/utils * @brief Common core utilities. */ @@ -283,6 +259,30 @@ * @brief Source code implementing all the arm_compute headers. */ +/** @dir src/core/NEON/kernels/detail + * @brief Common code for several intrinsics implementations. + */ + +/** @dir src/core/NEON/wrapper + * @brief NEON wrapper used to simplify code + */ + +/** @file src/core/NEON/wrapper/traits.h + * @brief Traits defined on NEON vectors + */ + +/** @file src/core/NEON/wrapper/wrapper.h + * @brief Includes all wrapper headers at once + */ + +/** @dir src/core/NEON/wrapper/intrinsics + * @brief NEON intrinsics wrappers + */ + +/** @dir src/core/NEON/wrapper/scalar + * @brief Scalar operations + */ + /** @dir src/core/CL/cl_kernels * @brief All the OpenCL kernels */ diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h new file mode 100644 index 0000000000..70d48d5835 --- /dev/null +++ b/src/core/NEON/NEAsymm.h @@ -0,0 +1,753 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. 
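To illustrate why traits, intrinsics and scalar helpers are bundled behind one wrapper header, here is a hedged sketch (hypothetical helper, 64-bit D-register vectors only) of the type-generic loop bodies they enable:

    template <typename T>
    void double_elements(const T *src, T *dst) // hypothetical
    {
        // The vload/vadd/vstore overloads select the matching *_u8/_s16/_f32
        // intrinsic from the argument types alone; no per-type code is needed.
        const auto v = arm_compute::wrapper::vload(src);
        arm_compute::wrapper::vstore(dst, arm_compute::wrapper::vadd(v, v));
    }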
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NEASYMM_H +#define ARM_COMPUTE_NEASYMM_H + +#include "src/core/NEON/NEMath.h" +#include <arm_neon.h> + +namespace arm_compute +{ +using qasymm8x8_t = uint8x8_t; /**< 8 bit quantized asymmetric vector with 8 elements */ +using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */ +using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */ +using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */ +using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */ + +using qasymm8x8_signed_t = int8x8_t; /**< 8 bit quantized signed asymmetric vector with 8 elements */ +using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */ +using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */ +using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */ +using qasymm8x16_signed_t = int8x16_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */ + +/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector + * + * vd*vs + vo + * + * @param[in] vd Input vector value in QASYMM8 format + * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. + * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. + * + * @return A 16-component vector in QASYMM8 format, saturated to fit + */ +uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo); + +/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector + * + * vd*vs + vo + * + * @param[in] vd Input vector value in QASYMM8_SIGNED format + * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. + * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. + * + * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit + */ +int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo); + +/** Performs final quantization step on 16 elements + * + * @param[in] in_s32 Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_u8 Relu lower bound + * @param[in] max_u8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied + * + * @return Quantized values + */ +inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32x4_t result_offset_after_shift_s32, + uint8x16_t min_u8, + uint8x16_t max_u8, + bool is_bounded_relu) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + if(result_shift < 0) + { + in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); + in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); + in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift))); + in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift))); + + in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + } + else + { + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + + // Round to the nearest division by a power-of-two using result_shift_s32 + in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); + in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift); + in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift); + in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift); + } + + // Add the offset terms + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to U8 + uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_u8 = vmaxq_u8(out_u8, min_u8); + out_u8 = vminq_u8(out_u8, max_u8); + } + + return out_u8; +} + +/** Performs final quantization step on 16 elements + * + * @param[in] in_s32 Input to be quantized. 
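A hedged sketch of calling the QASYMM8 variant above; the multiplier, shift and offset values are assumed, with the multiplier expressed in Q31 fixed point:

    int32x4x4_t acc = { { vdupq_n_s32(120), vdupq_n_s32(-4), vdupq_n_s32(0), vdupq_n_s32(300) } };
    const uint8x16_t q = finalize_quantization(acc,
                                               1073741824,       // ~0.5 in Q31 (assumed)
                                               1,                // result_shift (assumed)
                                               vdupq_n_s32(128), // offset after shift (assumed)
                                               vdupq_n_u8(0), vdupq_n_u8(255),
                                               true);            // apply the fused bounded ReLU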
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied + * + * @return Quantized values + */ +inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32x4_t result_offset_after_shift_s32, + int8x16_t min_s8, + int8x16_t max_s8, + bool is_bounded_relu) +{ + if(result_shift < 0) + { + in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); + in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); + in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift))); + in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift))); + + in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + } + else + { + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier); + in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier); + in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier); + in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier); + + // Round to the nearest division by a power-of-two using result_shift_s32 + in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift); + in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift); + in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift); + in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift); + } + + // Add the offset terms + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +/** Performs final quantization step on 16 elements for symmetric quantization + * + * @param[in] in_s32 Input to be quantized. 
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied + * + * @return Quantized values + */ +inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, + const int32x4x4_t &result_fixedpoint_multiplier, + const int32x4x4_t &result_shift, + const int32x4_t &result_offset_after_shift_s32, + const int8x16_t &min_s8, + const int8x16_t &max_s8, + const bool is_bounded_relu) +{ + const static int32x4_t one_s32 = vdupq_n_s32(1); + + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + int32x4x4_t res_shift_gt0 = + { + vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]), + vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]), + vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]), + vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]), + }; + // Round to the nearest division by a power-of-two using result_shift_s32 + res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]); + res_shift_gt0.val[1] = rounding_divide_by_pow2(res_shift_gt0.val[1], result_shift.val[1]); + res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]); + res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]); + + int32x4x4_t res_shift_lt0 = + { + vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))), + vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))), + vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))), + vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))), + }; + res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]); + res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]); + res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]); + res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]); + + // Select result depending on shift value + const uint32x4x4_t mask_lt0 = + { +#ifdef __aarch64__ + vcltzq_s32(result_shift.val[0]), + vcltzq_s32(result_shift.val[1]), + vcltzq_s32(result_shift.val[2]), + vcltzq_s32(result_shift.val[3]), +#else //__aarch64__ + vcltq_s32(result_shift.val[0], vdupq_n_s32(0)), + vcltq_s32(result_shift.val[1], vdupq_n_s32(0)), + vcltq_s32(result_shift.val[2], vdupq_n_s32(0)), + vcltq_s32(result_shift.val[3], vdupq_n_s32(0)), +#endif //__aarch64__ + }; + + in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]); + in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]); + in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]); + in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]); + + // Add the offset terms + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); + + // Convert S32 to 
S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +/** Performs final quantization step on single element + * + * @param[in] in_value Input to be quantized. + * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_u8 Relu lower bound + * @param[in] max_u8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied + * + * @return Quantized value + */ +inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, + int32_t result_shift, int32_t result_offset_after_shift_s32, + uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu) +{ + int32x4_t in_s32 = vdupq_n_s32(in_value); + + if(result_shift < 0) + { + in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + } + else + { + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); + // Shift value by result_shift_s32 + in_value = rounding_divide_by_pow2(in_value, result_shift); + } + + // Add the offset term + in_value += result_offset_after_shift_s32; + + // Bound the result + uint8_t out_u8 = static_cast<uint8_t>(std::max(0, std::min(255, in_value))); + if(is_bounded_relu) + { + out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8))); + } + + return out_u8; +} + +/** Performs final quantization step on single element + * + * @param[in] in_value Input to be quantized. + * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied + * + * @return Quantized value + */ +inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, + int32_t result_shift, int32_t result_offset_after_shift_s32, + int8_t min_s8, int8_t max_s8, bool is_bounded_relu) +{ + int32x4_t in_s32 = vdupq_n_s32(in_value); + + if(result_shift < 0) + { + in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + } + else + { + // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar + in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0); + + // Shift value by result_shift_s32 + in_value = rounding_divide_by_pow2(in_value, result_shift); + } + + // Add the offset term + in_value += result_offset_after_shift_s32; + + // Bound the result + int8_t out_s8 = static_cast<int8_t>(std::max(-128, std::min(127, in_value))); + if(is_bounded_relu) + { + out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8))); + } + + return out_s8; +} + +/** Dequantize a neon vector holding 8 quantized values. + * + * @param[in] qv Input values to be dequantized.
+/** Dequantize a neon vector holding 8 quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 8 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale   = qi.scale;
+    const int           offset  = qi.offset;
+    const int32x4_t     voffset = vdupq_n_s32(offset);
+    const float32x4_t   vscale  = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Dequantize a neon vector holding 16 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + } + }; + return vdequantized_input; +} + +/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] scale Quantization scaling factor. + * @param[in] offset Zero quantization offset. + * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset) +{ + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), + } + }; + return vdequantized_input; +} + +/** Dequantize a vector of 16 values stored as signed asymmetric. + * + * @param[in] qv Input values to be dequantized. + * @param[in] scale Quantization scaling factor. + * @param[in] offset Zero quantization offset. + * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset) +{ + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + } + }; + return vdequantized_input; +} + +/** Dequantize following symmetric quantization scheme a neon vector holding 16 quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] vscale Vector containing quantization scaling factors. 
+ * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale) +{ + const float32x4x4_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), + } + }; + return vdequantized_input; +} + +/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values. + * + * @param[in] qv Input values to be dequantized. + * @param[in] scale Quantization scaling factor. + * + * @return Dequantized values in a neon vector + */ +inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale) +{ + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + } + }; + return vdequantized_input; +} + +/** Quantize a neon vector holding 8 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return A neon vector holding the quantized values + */ +inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), +#endif //__aarch64__ + } + }; + return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); +} + +/** Quantize a neon vector holding 8 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return A neon vector holding the singed quantized values + */ +inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), +#endif //__aarch64__ + } + }; + return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); +} + +/** Quantize a neon vector holding 16 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. 
+ * + * @return A neon vector holding the quantized values + */ +inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#endif //__aarch64__ + } + }; + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_u8(pa, pb); +} + +/** Signed quantize a neon vector holding 16 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return A neon vector holding the quantized values + */ +inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#endif //__aarch64__ + } + }; + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} + +/** Quantize to QASYMM16 a neon vector holding 16 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. 
+ * + * @return A neon vector holding the quantized values + */ +inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#endif //__aarch64__ + } + }; + const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1])); + const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3])); + return { pa, pb }; +} +} // namespace arm_compute +#include "src/core/NEON/NEAsymm.inl" +#endif // ARM_COMPUTE_NEASYMM_H diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl new file mode 100644 index 0000000000..6ee1a336b8 --- /dev/null +++ b/src/core/NEON/NEAsymm.inl @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +namespace arm_compute +{ +inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo) +{ + // Convert uint8 vectors to uint16 vectors + const uint8x8_t vd_low = vget_low_u8(vd); + const uint8x8_t vd_high = vget_high_u8(vd); + uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low); + uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high); + // Convert uint16 vectors to uint32 vectors + uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8)); + uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8)); + uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8)); + uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8)); + // Convert uint32 vectors to float32 vectors + float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4); + float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4); + float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4); + float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4); + // vd = vd*vs + vo + A_f32x4 = vmlaq_f32(vo, A_f32x4, vs); + B_f32x4 = vmlaq_f32(vo, B_f32x4, vs); + C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); + D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); + // Convert float32 vectors to uint32 vectors + A_u32x4 = vcvtq_u32_f32(A_f32x4); + B_u32x4 = vcvtq_u32_f32(B_f32x4); + C_u32x4 = vcvtq_u32_f32(C_f32x4); + D_u32x4 = vcvtq_u32_f32(D_f32x4); + // Convert uint32 vectors to uint16 vectors (with saturation) + vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4)); + vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4)); + // convert uint16 vectors to uint8 vectors (with saturation) + return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); +} +inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo) +{ + // Convert uint8 vectors to int16 vectors + const int8x8_t vd_low = vget_low_s8(vd); + const int8x8_t vd_high = vget_high_s8(vd); + int16x8_t vd_low_s16x8 = vmovl_s8(vd_low); + int16x8_t vd_high_s16x8 = vmovl_s8(vd_high); + // Convert int16 vectors to int32 vectors + int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8)); + int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8)); + int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8)); + int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8)); + // Convert int32 vectors to float32 vectors + float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4); + float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4); + float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4); + float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4); + // vd = vd*vs + vo + A_f32x4 = vmlaq_f32(vo, A_f32x4, vs); + B_f32x4 = vmlaq_f32(vo, B_f32x4, vs); + C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); + D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); + // Convert float32 vectors to int32 vectors + A_s32x4 = vcvtq_s32_f32(A_f32x4); + B_s32x4 = vcvtq_s32_f32(B_f32x4); + C_s32x4 = vcvtq_s32_f32(C_f32x4); + D_s32x4 = vcvtq_s32_f32(D_f32x4); + // Convert int32 vectors to int16 vectors (with saturation) + vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4)); + vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4)); + // convert int16 vectors to int8 vectors (with saturation) + return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8)); +} +} // namespace arm_compute diff --git a/src/core/NEON/NEFixedPoint.h b/src/core/NEON/NEFixedPoint.h new file mode 100644 index 0000000000..5c49b25c3e --- /dev/null +++ b/src/core/NEON/NEFixedPoint.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEFIXEDPOINT_H
+#define ARM_COMPUTE_NEFIXEDPOINT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+/** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
+ *
+ * @param[in] a Float input vector
+ * @param[in] b Float input vector
+ *
+ * @return The lane-by-lane maximum -> float32x4x2
+ */
+float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b);
+} // namespace arm_compute
+#include "src/core/NEON/NEFixedPoint.inl"
+#endif /* ARM_COMPUTE_NEFIXEDPOINT_H */
\ No newline at end of file
diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl
new file mode 100644
index 0000000000..8bff9c4a8e
--- /dev/null
+++ b/src/core/NEON/NEFixedPoint.inl
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <array>
+#include <limits>
+
+namespace arm_compute
+{
+#ifndef DOXYGEN_SKIP_THIS
+
+inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
+{
+    float32x4x2_t res =
+    {
+        {
+            vmaxq_f32(a.val[0], b.val[0]),
+            vmaxq_f32(a.val[1], b.val[1])
+        }
+    };
+    return res;
+}
+#endif /* DOXYGEN_SKIP_THIS */
+} // namespace arm_compute
diff --git a/src/core/NEON/NEMath.h b/src/core/NEON/NEMath.h
new file mode 100644
index 0000000000..877ffb2827
--- /dev/null
+++ b/src/core/NEON/NEMath.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMATH_H
+#define ARM_COMPUTE_NEMATH_H
+
+#include <arm_neon.h>
+#include <array>
+
+namespace arm_compute
+{
+/** Calculate floor of a vector.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated floor vector.
+ */
+float32x4_t vfloorq_f32(float32x4_t val);
+
+/** Calculate round value of a vector to nearest with ties to even.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated round vector.
+ */
+float32x4_t vroundq_rte_f32(float32x4_t val);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float32x2_t vinvsqrt_f32(float32x2_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float32x2_t vinv_f32(float32x2_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float32x4_t vinvq_f32(float32x4_t x);
+
+/** Perform a 7th degree polynomial approximation using Estrin's method.
+ *
+ * @param[in] x      Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table.
+ *
+ * @return The calculated approximation.
+ */
+float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
+
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated exponent.
+ */
+float32x4_t vexpq_f32(float32x4_t x);
+
+/** Calculate logarithm
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+float32x4_t vlogq_f32(float32x4_t x);
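// ---------------------------------------------------------------------------
// Editor's illustration (not part of this patch): the declarations above are
// polynomial approximations, so their results differ slightly from libm. A
// minimal spot check of one vexpq_f32 lane against std::exp, with a loose
// relative tolerance; the tolerance value is an assumption for this sketch.
void vexpq_f32_spot_check_example()
{
    const float32x4_t x   = vdupq_n_f32(1.5f);
    const float32x4_t e   = arm_compute::vexpq_f32(x);
    const float       ref = std::exp(1.5f);
    // Approximation, not bit-exact: allow ~0.1% relative error
    assert(std::fabs(vgetq_lane_f32(e, 0) - ref) < 1e-3f * ref);
}
// ---------------------------------------------------------------------------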
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-5,5] to avoid overflowing issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+float32x4_t vtanhq_f32(float32x4_t val);
+
+/** Calculate n power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^(n-1)) / 2^n where n = exponent
+ *
+ * @param[in] x        Vector of 4 elements
+ * @param[in] exponent Vector of 4 elements with integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^(n-1)) / 2^n where n = exponent
+ *
+ * @param[in] x        Vector of 4 elements
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^(n-1)) / 2^n where n = exponent
+ *
+ * @param[in] x        Element to divide.
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32_t rounding_divide_by_pow2(int32_t x, int exponent);
+
+/** Converts from uint8x16 to float32x4x4_t
+ *
+ * @param[in] in Vector of uint8 to be converted
+ *
+ * @return Converted vector of float
+ */
+float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in);
+
+/** Converts from int8x16 to float32x4x4_t
+ *
+ * @param[in] in Vector of int8 to be converted
+ *
+ * @return Converted vector of float
+ */
+float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in);
+
+/** Converts to float32x4x4_t from the specified templated 16 elements vectors
+ *
+ * @param[in] in Vector of float to be converted
+ *
+ * @return Converted vector of float
+ */
+template <typename T>
+float32x4x4_t convert_to_float32x4x4(const T &in);
+
+/** Converts from two float32x4x3_t to just one uint8x8x3_t
+ *
+ * @param[in]  in1 First input vector of float to be converted
+ * @param[in]  in2 Second input vector of float to be converted
+ * @param[out] out Converted output vector uint8 to store the result
+ */
+void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out);
+
+/** Converts from a float32x4x4_t to just one uint8x16_t
+ *
+ * @param[in]  in  Vector of float to be converted
+ * @param[out] out Converted vector of uint8 to store the result
+ */
+void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out);
+
+/** Converts from float32x4x4_t to just one int8x16_t
+ *
+ * @param[in]  in  Vector of float to be converted
+ * @param[out] out Converted vector of int8 to store the result
+ */
+void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out);
+
+/** Calculate sine.
+ * + * @param[in] val Input vector value in radians, F32 format. + * + * @return The calculated sine. + */ +float32x4_t vsinq_f32(float32x4_t val); + +/** Calculate sine. + * + * @param[in] val Input vector value in radians, F32 format. + * + * @return The calculated sine. + */ +float32x2_t vsin_f32(float32x2_t val); + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +/** Calculate hyperbolic tangent. + * + * tanh(x) = (e^2x - 1)/(e^2x + 1) + * + * @note We clamp x to [-5,5] to avoid overflowing issues. + * + * @param[in] val Input vector value in F16 format. + * + * @return The calculated Hyperbolic Tangent. + */ +float16x8_t vtanhq_f16(float16x8_t val); + +/** Calculate round value of a vector to nearest with ties to even. + * + * @param[in] val Input vector value in F16 format. + * + * @return The calculated round vector. + */ +float16x8_t vroundq_rte_f16(float16x8_t val); + +/** Calculate reciprocal. + * + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +float16x4_t vinv_f16(float16x4_t x); + +/** Calculate reciprocal. + * + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +float16x8_t vinvq_f16(float16x8_t x); + +/** Calculate inverse square root. + * + * @param[in] x Input value. + * + * @return The calculated inverse square root. + */ +float16x4_t vinvsqrt_f16(float16x4_t x); + +/** Calculate inverse square root. + * + * @param[in] x Input value. + * + * @return The calculated inverse square root. + */ +float16x8_t vinvsqrtq_f16(float16x8_t x); + +/** Calculate exponential + * + * @param[in] x Input vector value in F16 format. + * + * @return The calculated exponent. + */ +float16x8_t vexpq_f16(float16x8_t x); + +/** Calculate n power of a number. + * + * pow(x,n) = e^(n*log(x)) + * + * @param[in] val Input vector value in F16 format. + * @param[in] n Powers to raise the input to. + * + * @return The calculated power. + */ +float16x8_t vpowq_f16(float16x8_t val, float16x8_t n); + +/** Calculate sine. + * + * @param[in] val Input vector value in radians, F16 format. + * + * @return The calculated sine. + */ +float16x8_t vsinq_f16(float16x8_t val); + +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +} // namespace arm_compute +#include "src/core/NEON/NEMath.inl" +#endif /* ARM_COMPUTE_NEMATH_H */ diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl new file mode 100644 index 0000000000..a1c3d41880 --- /dev/null +++ b/src/core/NEON/NEMath.inl @@ -0,0 +1,529 @@ +/* + * Copyright (c) 2016-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cmath>
+#include <limits>
+
+#ifndef M_PI
+#define M_PI (3.14159265358979323846)
+#endif // M_PI
+
+namespace arm_compute
+{
+/** Exponent polynomial coefficients */
+const std::array<float32x4_t, 8> exp_tab =
+{
+    {
+        vdupq_n_f32(1.f),
+        vdupq_n_f32(0.0416598916054f),
+        vdupq_n_f32(0.500000596046f),
+        vdupq_n_f32(0.0014122662833f),
+        vdupq_n_f32(1.00000011921f),
+        vdupq_n_f32(0.00833693705499f),
+        vdupq_n_f32(0.166665703058f),
+        vdupq_n_f32(0.000195780929062f),
+    }
+};
+
+/** Logarithm polynomial coefficients */
+const std::array<float32x4_t, 8> log_tab =
+{
+    {
+        vdupq_n_f32(-2.29561495781f),
+        vdupq_n_f32(-2.47071170807f),
+        vdupq_n_f32(-5.68692588806f),
+        vdupq_n_f32(-0.165253549814f),
+        vdupq_n_f32(5.17591238022f),
+        vdupq_n_f32(0.844007015228f),
+        vdupq_n_f32(4.58445882797f),
+        vdupq_n_f32(0.0141278216615f),
+    }
+};
+
+/** Sin polynomial coefficients */
+constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
+constexpr float te_sin_coeff3 = 0.05f;           // 1/(4*5)
+constexpr float te_sin_coeff4 = 0.023809523810f; // 1/(6*7)
+constexpr float te_sin_coeff5 = 0.013888888889f; // 1/(8*9)
+
+#ifndef DOXYGEN_SKIP_THIS
+inline float32x4_t vfloorq_f32(float32x4_t val)
+{
+    static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+
+    const int32x4_t   z = vcvtq_s32_f32(val);
+    const float32x4_t r = vcvtq_f32_s32(z);
+
+    return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r);
+}
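// ---------------------------------------------------------------------------
// Editor's illustration (not part of this patch): vfloorq_f32 above floors by
// truncating and then correcting; this sketch checks the negative-input path,
// where plain vcvtq_s32_f32 truncation alone would be off by one.
void vfloorq_f32_example()
{
    const float32x4_t v = vsetq_lane_f32(-1.25f, vdupq_n_f32(2.75f), 0);
    const float32x4_t f = arm_compute::vfloorq_f32(v);
    assert(vgetq_lane_f32(f, 0) == -2.0f); // floor(-1.25) == -2, not the truncation -1
    assert(vgetq_lane_f32(f, 3) == 2.0f);  // floor(2.75) == 2
}
// ---------------------------------------------------------------------------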
+inline float32x4_t vroundq_rte_f32(float32x4_t val)
+{
+#ifdef __aarch64__
+    return vrndnq_f32(val);
+#else  // __aarch64__
+    static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
+    static const float32x4_t CONST_1_FLOAT    = vdupq_n_f32(1.f);
+    static const int32x4_t   CONST_1_INT      = vdupq_n_s32(1);
+    const float32x4_t        floor_val        = vfloorq_f32(val);
+    const float32x4_t        diff             = vsubq_f32(val, floor_val);
+
+    /*
+     * Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0).
+     * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
+     */
+
+    return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+                     floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+#endif // __aarch64__
+}
+
+inline float32x2_t vinvsqrt_f32(float32x2_t x)
+{
+    float32x2_t sqrt_reciprocal = vrsqrte_f32(x);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x2_t vinv_f32(float32x2_t x)
+{
+    float32x2_t recip = vrecpe_f32(x);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+    float32x4_t recip = vrecpeq_f32(x);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
+{
+    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
+    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
+    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
+    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
+    float32x4_t x2  = vmulq_f32(x, x);
+    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
+    return res;
+}
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+    static const float32x4_t CONST_LN2          = vdupq_n_f32(0.6931471805f); // ln(2)
+    static const float32x4_t CONST_INV_LN2      = vdupq_n_f32(1.4426950408f); // 1/ln(2)
+    static const float32x4_t CONST_INF          = vdupq_n_f32(std::numeric_limits<float>::infinity());
+    static const float32x4_t CONST_MAX_INPUT    = vdupq_n_f32(88.7f);
+    static const float32x4_t CONST_0            = vdupq_n_f32(0.f);
+    static const int32x4_t   CONST_NEGATIVE_126 = vdupq_n_s32(-126);
+
+    // Perform range reduction [-log(2),log(2)]
+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct
+    poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
+    poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly); // Handle underflow
+    poly = vbslq_f32(vcgtq_f32(x, CONST_MAX_INPUT), CONST_INF, poly);  // Handle overflow
+
+    return poly;
+}
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+    static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
+    static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+
+    // Extract exponent
+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+    // Polynomial
Approximation + float32x4_t poly = vtaylor_polyq_f32(val, log_tab); + + // Reconstruct + poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2); + + return poly; +} + +inline float32x4_t vtanhq_f32(float32x4_t val) +{ + static const float32x4_t CONST_1 = vdupq_n_f32(1.f); + static const float32x4_t CONST_2 = vdupq_n_f32(2.f); + static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f); + static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f); + + float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH); + float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x)); + float32x4_t num = vsubq_f32(exp2x, CONST_1); + float32x4_t den = vaddq_f32(exp2x, CONST_1); + float32x4_t tanh = vmulq_f32(num, vinvq_f32(den)); + return tanh; +} + +inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n) +{ + return vexpq_f32(vmulq_f32(n, vlogq_f32(val))); +} + +inline float32x4_t vsinq_f32(float32x4_t val) +{ + const float32x4_t pi_v = vdupq_n_f32(M_PI); + const float32x4_t pio2_v = vdupq_n_f32(M_PI / 2); + const float32x4_t ipi_v = vdupq_n_f32(1 / M_PI); + + //Find positive or negative + const int32x4_t c_v = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v))); + const uint32x4_t sign_v = vcleq_f32(val, vdupq_n_f32(0)); + const uint32x4_t odd_v = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1)); + + uint32x4_t neg_v = veorq_u32(odd_v, sign_v); + + //Modulus a - (n * int(a*(1/n))) + float32x4_t ma = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v))); + const uint32x4_t reb_v = vcgeq_f32(ma, pio2_v); + + //Rebase a between 0 and pi/2 + ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma); + + //Taylor series + const float32x4_t ma2 = vmulq_f32(ma, ma); + + //2nd elem: x^3 / 3! + float32x4_t elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(te_sin_coeff2)); + float32x4_t res = vsubq_f32(ma, elem); + + //3rd elem: x^5 / 5! + elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff3)); + res = vaddq_f32(res, elem); + + //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val) + elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff4)); + res = vsubq_f32(res, elem); + + //5th elem: x^9 / 9! + elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff5)); + res = vaddq_f32(res, elem); + + //Change of sign + neg_v = vshlq_n_u32(neg_v, 31); + res = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v)); + return res; +} + +inline float32x2_t vsin_f32(float32x2_t val) +{ + const float32x2_t pi_v = vdup_n_f32(M_PI); + const float32x2_t pio2_v = vdup_n_f32(M_PI / 2); + const float32x2_t ipi_v = vdup_n_f32(1 / M_PI); + + //Find positive or negative + const int32x2_t c_v = vabs_s32(vcvt_s32_f32(vmul_f32(val, ipi_v))); + const uint32x2_t sign_v = vcle_f32(val, vdup_n_f32(0)); + const uint32x2_t odd_v = vand_u32(vreinterpret_u32_s32(c_v), vdup_n_u32(1)); + + uint32x2_t neg_v = veor_u32(odd_v, sign_v); + + //Modulus a - (n * int(a*(1/n))) + float32x2_t ma = vsub_f32(vabs_f32(val), vmul_f32(pi_v, vcvt_f32_s32(c_v))); + const uint32x2_t reb_v = vcge_f32(ma, pio2_v); + + //Rebase a between 0 and pi/2 + ma = vbsl_f32(reb_v, vsub_f32(pi_v, ma), ma); + + //Taylor series + const float32x2_t ma2 = vmul_f32(ma, ma); + + //2nd elem: x^3 / 3! + float32x2_t elem = vmul_f32(vmul_f32(ma, ma2), vdup_n_f32(te_sin_coeff2)); + float32x2_t res = vsub_f32(ma, elem); + + //3rd elem: x^5 / 5! 
+ elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff3)); + res = vadd_f32(res, elem); + + //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val) + elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff4)); + res = vsub_f32(res, elem); + + //5th elem: x^9 / 9! + elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff5)); + res = vadd_f32(res, elem); + + //Change of sign + neg_v = vshl_n_u32(neg_v, 31); + res = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(res), neg_v)); + return res; +} + +#endif /* DOXYGEN_SKIP_THIS */ + +inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent) +{ + const int32x4_t shift_vec = vnegq_s32(exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); + const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed_up_x, shift_vec); +} + +inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) +{ + const int32x4_t shift_vec = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); + const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed_up_x, shift_vec); +} + +inline int32_t rounding_divide_by_pow2(int32_t x, int exponent) +{ + const int32_t mask = (1 << exponent) - 1; + const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); + return (x >> exponent) + ((x & mask) > threshold ? 1 : 0); +} + +inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in) +{ + float32x4x4_t out; + + const auto tmp1 = vmovl_u8(vget_low_u8(in)); + out.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1))); + out.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1))); + + const auto tmp2 = vmovl_u8(vget_high_u8(in)); + out.val[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2))); + out.val[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2))); + return out; +} + +inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in) +{ + float32x4x4_t out; + + const auto tmp1 = vmovl_s8(vget_low_s8(in)); + out.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1))); + out.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1))); + + const auto tmp2 = vmovl_s8(vget_high_s8(in)); + out.val[2] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2))); + out.val[3] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2))); + return out; +} + +template <> +inline float32x4x4_t convert_to_float32x4x4(const uint8x16_t &in) +{ + return convert_uint8x16_to_float32x4x4(in); +} + +template <> +inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in) +{ + return convert_int8x16_to_float32x4x4(in); +} + +inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out) +{ + out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), + vqmovn_u32(vcvtq_u32_f32(in2.val[0])))); + out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), + vqmovn_u32(vcvtq_u32_f32(in2.val[1])))); + out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), + vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); +} + +inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) +{ + const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), + vqmovn_u32(vcvtq_u32_f32(in.val[1]))); + const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), + vqmovn_u32(vcvtq_u32_f32(in.val[3]))); + out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); +} + +inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) +{ + const auto low = 
vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
+                                  vqmovn_s32(vcvtq_s32_f32(in.val[1])));
+    const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
+                                   vqmovn_s32(vcvtq_s32_f32(in.val[3])));
+    out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Exponent polynomial coefficients */
+/** Logarithm polynomial coefficients */
+#ifndef DOXYGEN_SKIP_THIS
+inline float16x8_t vfloorq_f16(float16x8_t val)
+{
+    static const float16x8_t CONST_1 = vdupq_n_f16(1.f);
+
+    const int16x8_t   z = vcvtq_s16_f16(val);
+    const float16x8_t r = vcvtq_f16_s16(z);
+
+    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, CONST_1), r);
+}
+
+inline float16x8_t vroundq_rte_f16(float16x8_t val)
+{
+    return vrndnq_f16(val);
+}
+
+inline float16x4_t vinvsqrt_f16(float16x4_t x)
+{
+    float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    return sqrt_reciprocal;
+}
+
+inline float16x8_t vinvsqrtq_f16(float16x8_t x)
+{
+    float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
+    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    return sqrt_reciprocal;
+}
+
+inline float16x4_t vinv_f16(float16x4_t x)
+{
+    float16x4_t recip = vrecpe_f16(x);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vinvq_f16(float16x8_t x)
+{
+    float16x8_t recip = vrecpeq_f16(x);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vtanhq_f16(float16x8_t val)
+{
+    const float16x8_t CONST_1        = vdupq_n_f16(1.f);
+    const float16x8_t CONST_2        = vdupq_n_f16(2.f);
+    const float16x8_t CONST_MIN_TANH = vdupq_n_f16(-10.f);
+    const float16x8_t CONST_MAX_TANH = vdupq_n_f16(10.f);
+
+    const float16x8_t x     = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x));
+    const float16x8_t num   = vsubq_f16(exp2x, CONST_1);
+    const float16x8_t den   = vaddq_f16(exp2x, CONST_1);
+    const float16x8_t tanh  = vmulq_f16(num, vinvq_f16(den));
+    return tanh;
+}
+
+inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs)
+{
+    const float16x8_t A   = vaddq_f16(coeffs[0], vmulq_f16(coeffs[4], x));
+    const float16x8_t B   = vaddq_f16(coeffs[2], vmulq_f16(coeffs[6], x));
+    const float16x8_t C   = vaddq_f16(coeffs[1], vmulq_f16(coeffs[5], x));
+    const float16x8_t D   = vaddq_f16(coeffs[3], vmulq_f16(coeffs[7], x));
+    const float16x8_t x2  = vmulq_f16(x, x);
+    const float16x8_t x4  = vmulq_f16(x2, x2);
+    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
+    return res;
+}
+
+inline float16x8_t vexpq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
+    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
+
+    const float16x8_t res = vcombine_f16(vcvt_f16_f32(vexpq_f32(x_low)), vcvt_f16_f32(vexpq_f32(x_high)));
+    return res;
+}
+
+inline float16x8_t vlogq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t x_high =
vcvt_f32_f16(vget_high_f16(x)); + const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); + + const float16x8_t res = vcombine_f16(vcvt_f16_f32(vlogq_f32(x_low)), vcvt_f16_f32(vlogq_f32(x_high))); + return res; +} + +inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n) +{ + // TODO (giaiod01) - COMPMID-1535 + float32x4_t n0_f32 = vcvt_f32_f16(vget_low_f16(n)); + float32x4_t n1_f32 = vcvt_f32_f16(vget_high_f16(n)); + float32x4_t val0_f32 = vcvt_f32_f16(vget_low_f16(val)); + float32x4_t val1_f32 = vcvt_f32_f16(vget_high_f16(val)); + + float32x4_t res0_f32 = vexpq_f32(vmulq_f32(n0_f32, vlogq_f32(val0_f32))); + float32x4_t res1_f32 = vexpq_f32(vmulq_f32(n1_f32, vlogq_f32(val1_f32))); + + return vcombine_f16(vcvt_f16_f32(res0_f32), vcvt_f16_f32(res1_f32)); +} + +inline float16x8_t vsinq_f16(float16x8_t val) +{ + const float32x4_t val_high = vcvt_f32_f16(vget_high_f16(val)); + const float32x4_t val_low = vcvt_f32_f16(vget_low_f16(val)); + + const float32x4_t res_high = vsinq_f32(val_high); + const float32x4_t res_low = vsinq_f32(val_low); + + return vcombine_f16(vcvt_f16_f32(res_low), vcvt_f16_f32(res_high)); +} + +inline float16x4_t vsin_f16(float16x4_t val) +{ + const float32x4_t val_f32 = vcvt_f32_f16(val); + const float32x2_t val_high = vget_high_f32(val_f32); + const float32x2_t val_low = vget_low_f32(val_f32); + + const float32x2_t res_high = vsin_f32(val_high); + const float32x2_t res_low = vsin_f32(val_low); + + return vcvt_f16_f32(vcombine_f32(res_low, res_high)); +} + +#endif /* DOXYGEN_SKIP_THIS */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +} // namespace arm_compute diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h new file mode 100644 index 0000000000..e6644577a1 --- /dev/null +++ b/src/core/NEON/NESymm.h @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_NESYMM_H
+#define ARM_COMPUTE_NESYMM_H
+
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qsymm8_t  = int8_t;  /**< 8 bit quantized symmetric scalar value */
+using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */
+
+using qsymm16x8_t   = int16x8_t;   /**< 16 bit quantized symmetric vector with 8 elements */
+using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 16 elements */
+
+/** Performs final quantization step on 8 signed 16-bit elements
+ *
+ * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
+ *
+ * @param[in] in_s32                       Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift                 Result shift parameter
+ * @param[in] min_s16                      Relu lower bound
+ * @param[in] max_s16                      Relu upper bound
+ *
+ * @return Quantized values
+ */
+template <bool is_bounded_relu>
+int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
+                                      int         result_fixedpoint_multiplier,
+                                      int32_t     result_shift,
+                                      int16x8_t   min_s16,
+                                      int16x8_t   max_s16)
+{
+    if(result_shift < 0)
+    {
+        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
+        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
+
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        // Round to the nearest division by a power-of-two using result_shift_s32
+        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+    }
+
+    // Convert S32 to S16
+    int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_s16 = vmaxq_s16(out_s16, min_s16);
+        out_s16 = vminq_s16(out_s16, max_s16);
+    }
+
+    return out_s16;
+}
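// ---------------------------------------------------------------------------
// Editor's illustration (not part of this patch): requantizing eight int32
// accumulators to QSYMM16 with the template above. The multiplier and shift
// are made-up example values, not taken from the library.
int16x8_t requantize_qsymm16_example(int32x4x2_t acc)
{
    const int16x8_t unused_min = vdupq_n_s16(0);
    const int16x8_t unused_max = vdupq_n_s16(0);
    // is_bounded_relu == false, so the min/max arguments are ignored
    return arm_compute::finalize_quantization_int16<false>(acc, 1073741824 /* hypothetical */, 2,
                                                           unused_min, unused_max);
}
// ---------------------------------------------------------------------------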
+
+/** Performs final quantization step on single signed 16-bit element
+ *
+ * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
+ *
+ * @param[in] in_value                     Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift                 Result shift parameter
+ * @param[in] min_s16                      Relu lower bound
+ * @param[in] max_s16                      Relu upper bound
+ *
+ * @return Quantized values
+ */
+template <bool is_bounded_relu>
+inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
+                                           int32_t result_shift, int16_t min_s16, int16_t max_s16)
+{
+    if(result_shift < 0)
+    {
+        const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
+        in_value            = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        const int64_t in_64 = static_cast<int64_t>(in_value) * static_cast<int64_t>(result_fixedpoint_multiplier);
+        // Shift value by result_shift_s32
+        in_value = rounding_divide_by_pow2(static_cast<int32_t>((in_64 + (1 << 30)) >> 31), result_shift);
+    }
+
+    // Bound the result
+    int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
+
+    if(is_bounded_relu)
+    {
+        out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
+    }
+
+    return out_s16;
+}
+
+/** Dequantize a neon vector holding 8 16-bit quantized values.
+ *
+ * @param[in] qv    Input values to be dequantized.
+ * @param[in] scale Quantization scale
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
+{
+    const float32x4_t   vscale = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
+        }
+    };
+    return vdequantized_input;
+}
+
+/** Quantize a neon vector holding 8 floating point values.
+ *
+ * @param[in] qv    Input values to be quantized.
+ * @param[in] scale Quantization scale
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
+{
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+
+    const int32x4x2_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+#else  //__aarch64__
+            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+#endif //__aarch64__
+        }
+    };
+    return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+}
+
+/** Dequantize a neon vector holding 16 16-bit quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float         scale  = qi.scale;
+    const float32x4_t   vscale = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
+            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
+        }
+    };
+    return vdequantized_input;
+}
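// ---------------------------------------------------------------------------
// Editor's illustration (not part of this patch): round-tripping eight floats
// through QSYMM16 with the helpers above. With scale 0.5f, 3.0f quantizes to
// lane value 6 and dequantizes back to 3.0f exactly; -1.25f maps to -2 (round
// to nearest) and comes back as -1.0f.
float32x4x2_t qsymm16_round_trip_example()
{
    const float         scale = 0.5f;
    const float32x4x2_t v     = { { vdupq_n_f32(3.0f), vdupq_n_f32(-1.25f) } };
    const int16x8_t     q     = arm_compute::vquantize_int16(v, scale);
    return arm_compute::vdequantize_int16(q, scale);
}
// ---------------------------------------------------------------------------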
+
+/** Quantize a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float scale = qi.scale;
+    ARM_COMPUTE_ERROR_ON(scale == 0.f);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const qsymm16x8x2_t res =
+    {
+        vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
+        vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
+    };
+
+    return res;
+}
+
+/** Multiply a neon vector using quantized multiplier and shift
+ *
+ * @param[in] input Input vector of values to be multiplied
+ * @param[in] qmul  Quantized multiplier
+ * @param[in] shift Left bit shift
+ *
+ * @return A neon vector holding the multiplied value
+ */
+inline int32x4x2_t multiply_by_quantized_multiplier_2row(int32x4x2_t input, int32_t qmul, int32_t shift)
+{
+    const auto left_shift  = shift > 0 ? shift : 0;
+    const auto right_shift = shift > 0 ? 0 : -shift;
+    const auto one_shifted = 1 << left_shift;
+
+    int32x4x2_t result;
+    result.val[0] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[0], one_shifted), qmul), right_shift);
+    result.val[1] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[1], one_shifted), qmul), right_shift);
+
+    return result;
+}
+
+} // namespace arm_compute
+#endif // ARM_COMPUTE_NESYMM_H
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index b15df311cc..621af51f3c 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -26,12 +26,12 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include
 #include
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 5f5a3e5b37..525e2866f2 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include
 #include
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index b2700d9cd6..a3da7508ab 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -24,11 +24,11 @@
 #include
"arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" #include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp index 0ee6d0efcf..c7169d8932 100644 --- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp @@ -26,12 +26,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 0651cf28e6..50e46474b5 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -25,15 +25,15 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp index fa8332e803..caaa6c22e8 100644 --- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,9 +25,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp index 1f0796519b..bc8c77543a 100644 --- a/src/core/NEON/kernels/NEColorConvertKernel.cpp +++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,12 +29,13 @@ #include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/MultiImageInfo.h" -#include "arm_compute/core/NEON/NEColorConvertHelper.inl" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/kernels/detail/NEColorConvertHelper.inl" + using namespace arm_compute; NEColorConvertKernel::NEColorConvertKernel() diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp index d439f4314d..f40f1215d3 100644 --- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp +++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,10 +26,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp index 03bc9f0f75..7c65e71727 100644 --- a/src/core/NEON/kernels/NECropKernel.cpp +++ b/src/core/NEON/kernels/NECropKernel.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/bit_ops.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp index 6926ec1aac..6066326fec 100644 --- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp index 5df3e3ee7d..ee23909bd6 100644 --- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/SaturateCast.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" -#include 
"arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" using namespace arm_compute; diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index d012cbfded..6465848999 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp index 62b2531daf..fb47879b17 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp @@ -25,11 +25,11 @@ #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/NEON/wrapper/traits.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/ToolchainSupport.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp index fc0933bcd1..9352088b1f 100644 --- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,12 +27,12 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 559b67316f..ac1d6aec8f 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -22,7 +22,9 @@ * SOFTWARE. 
*/ #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" + +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CPP/Validate.h" @@ -30,15 +32,13 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include -#include using namespace arm_compute; using namespace arm_compute::detail; diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index 8e2b88f5a5..c022fa05a0 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -28,13 +28,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp index da53a523e6..f862d04b22 100644 --- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp +++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp index 747bd41dc0..40430bdb81 100644 --- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp +++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp @@ -27,8 +27,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/ToolchainSupport.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp index c041b4c56a..de8ba3f484 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,8 +34,8 @@ #include #include -#include "arm_compute/core/NEON/wrapper/traits.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp index ea2831f88d..d99ff953fc 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp +++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,11 +24,11 @@ #include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp index f0781341d5..e134097f7a 100644 --- a/src/core/NEON/kernels/NEFloorKernel.cpp +++ b/src/core/NEON/kernels/NEFloorKernel.cpp @@ -29,8 +29,8 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/NEMath.h" #include diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp index 282b1a6f4d..00d251f79e 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp @@ -26,11 +26,11 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index e9332b2cb6..8d0d7c26a3 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp index 458b94b93c..023b798b9a 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp @@ -27,12 +27,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include 
"arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp index 44d55652a3..68f16c5fc7 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NESymm.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NESymm.h" #include #include diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp index a0a5c5d79f..2ef32c4e81 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEAsymm.h" #include #include diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index a926903598..8fc33dcc82 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEAsymm.h" #include #include diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 3ac2efc397..1494cd459c 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -26,8 +26,8 @@ #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git 
a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp index 2cac93ab93..bd931469a3 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,9 +26,9 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" #include diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp index 5bec9d321b..5c5367c9c1 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,7 +29,6 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" @@ -37,6 +36,7 @@ #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/float_ops.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEFixedPoint.h" #include #include diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp index 8a671bfa23..fc7b819f6a 100644 --- a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp @@ -27,12 +27,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index f650d97c45..78acbc399d 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -28,12 +28,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp index dbcfda2184..d99def53ba 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -26,13 +26,13 @@ #include "arm_compute/core/Error.h" #include 
"arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp index dd2824b1b6..9eafe18020 100644 --- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,12 +28,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEFixedPoint.h" #include #include diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index 3fa44804f5..bcce843638 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,11 +26,11 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 6cd0780777..7b888266fb 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp index d840bb74ff..1b52117bbe 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,11 +26,11 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index 4466c24604..c5320b9dbf 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -24,10 +24,10 @@ #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" #include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 9b5736a9b0..1310ef3521 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -28,17 +28,17 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" #include "support/ToolchainSupport.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include #include @@ -2415,7 +2415,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const // Store result wrapper::vstore(reinterpret_cast(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? 
vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : vres); + requant_qinfo) : + vres); } } diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp index cbfbda71e2..6a038f8f44 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp @@ -25,15 +25,16 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" + +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp index 26ba4016e1..6d5202d6b5 100644 --- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp @@ -25,12 +25,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/CPP/Validate.h" diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp index c8a456a427..7d8fbb1ec1 100644 --- a/src/core/NEON/kernels/NERangeKernel.cpp +++ b/src/core/NEON/kernels/NERangeKernel.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 1691f6850c..01534f77b4 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -29,14 +29,14 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/SaturateCast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" #include namespace arm_compute diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index 2e6135b44c..0c44a7e0c9 
100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -23,10 +23,10 @@ */ #include "arm_compute/core/NEON/kernels/NEReverseKernel.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp index 1a853527b9..94f5a18102 100644 --- a/src/core/NEON/kernels/NEScaleKernel.cpp +++ b/src/core/NEON/kernels/NEScaleKernel.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Rounding.h" #include "arm_compute/core/utils/misc/Utility.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/ScaleUtils.h" diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index 86e8233e0f..286b8a63c8 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -28,10 +28,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "utils/TypePrinter.h" #include @@ -229,7 +229,7 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x); ARM_COMPUTE_RETURN_ERROR_ON(x->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y); diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index bc5b0c0696..e71818f213 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -28,14 +28,14 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/SaturateCast.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp index e2fe88cc0e..ccad92a685 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" 
#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp index b342cd2047..2667611d2c 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp index 6b291fdcd6..9e8ec5c106 100644 --- a/src/core/NEON/kernels/NEThresholdKernel.cpp +++ b/src/core/NEON/kernels/NEThresholdKernel.cpp @@ -28,7 +28,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp index 02cf1334ac..69324c1693 100644 --- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp +++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp @@ -27,11 +27,11 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp index 171f5965a5..d12b10c69e 100644 --- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp @@ -27,12 +27,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp index b61633dc30..591aa1e5e6 100644 --- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp +++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp @@ -26,14 +26,15 @@ #include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include 
"src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" + +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h new file mode 100644 index 0000000000..eef1be06eb --- /dev/null +++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H +#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace detail +{ +/** Dummy activation object */ +template +struct dummy +{ + /** NEON vector type. */ + using ExactType = typename wrapper::traits::neon_vector::type; + + /** Construct a dummy activation object. + * + * @param[in] act_info Activation layer information. + */ + explicit dummy(ActivationLayerInfo act_info) + { + ARM_COMPUTE_UNUSED(act_info); + } + + /** Run activation function. + * + * @param[in] vval Vector of values. + */ + void operator()(ExactType &vval) + { + ARM_COMPUTE_UNUSED(vval); + } + + /** Run activation function. + * + * @param[in] val Scalar value. + */ + void operator()(T &val) + { + ARM_COMPUTE_UNUSED(val); + } +}; +/** Linear activation object */ +template +struct linear +{ + /** NEON vector type. */ + using ExactType = typename wrapper::traits::neon_vector::type; + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + /** Construct a Linear activation object. + * + * @param[in] act_info Activation layer information. + */ + explicit linear(ActivationLayerInfo act_info) + : alpha(act_info.a()), + beta(act_info.b()), + valpha(wrapper::vdup_n(static_cast(alpha), ExactTagType{})), + vbeta(wrapper::vdup_n(static_cast(beta), ExactTagType{})) + { + } + + /** Run activation function. + * + * @param[in] vval Vector of values. + */ + void operator()(ExactType &vval) + { + vval = wrapper::vmla(vbeta, vval, valpha); + } + + /** Run activation function. + * + * @param[in] val Scalar value. + */ + void operator()(T &val) + { + val = alpha * val + beta; + } + + const T alpha; /**< Scalar alpha */ + const T beta; /**< Scalar alpha */ + const ExactType valpha; /**< Vector of alphas. */ + const ExactType vbeta; /**< Vector of betas. 
+/** Square activation object */
+template <typename T, int S>
+struct square
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a Square activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit square(ActivationLayerInfo act_info)
+    {
+        ARM_COMPUTE_UNUSED(act_info);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmul(vval, vval);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = val * val;
+    }
+};
+/** Logistic activation object */
+template <typename T, int S>
+struct logistic
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a Logistic activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit logistic(ActivationLayerInfo act_info)
+        : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
+    {
+        ARM_COMPUTE_UNUSED(act_info);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = 1 / (1 + std::exp(-val));
+    }
+
+    /** Vector of ones. */
+    const ExactType vone;
+};
+/** RELU activation object */
+template <typename T, int S>
+struct relu
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a RELU activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit relu(ActivationLayerInfo act_info)
+        : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
+    {
+        ARM_COMPUTE_UNUSED(act_info);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmax(vzero, vval);
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::max(static_cast<T>(0), val);
+    }
+
+    /** Vector of zeroes. */
+    const ExactType vzero;
+};
+/** Bounded RELU activation object */
+template <typename T, int S>
+struct brelu
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a bounded RELU activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit brelu(ActivationLayerInfo act_info)
+        : alpha(act_info.a()),
+          vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})),
+          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
+    {
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(static_cast<T>(0), val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const ExactType vzero;  /**< Vector of zeroes. */
+    const ExactType valpha; /**< Vector of alphas. */
+};
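These functors are instantiated per element type T and vector width S and applied in place. A minimal sketch for the relu case (float with four lanes is an assumed instantiation, and the ACL header defining ActivationLayerInfo is assumed to be on the include path):

    #include <arm_neon.h>

    // Illustrative only: drive detail::relu<float, 4> over one vector and one scalar.
    void relu_example(float32x4_t &vdata, float &sdata)
    {
        using namespace arm_compute;
        ActivationLayerInfo    info(ActivationLayerInfo::ActivationFunction::RELU);
        detail::relu<float, 4> act(info); // ExactType resolves to float32x4_t
        act(vdata); // vector path: max(0, lane) per lane
        act(sdata); // scalar path: std::max(0.f, sdata)
    }

The same pattern applies to brelu and lubrelu, which additionally read act_info.a() and act_info.b() as the clamp bounds.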
+/** Lower-Upper Bounded RELU activation object */
+template <typename T, int S>
+struct lubrelu
+{
+    /** NEON vector type. */
+    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    /** Construct a lower-upper bounded RELU activation object.
+     *
+     * @param[in] act_info Activation layer information.
+     */
+    explicit lubrelu(ActivationLayerInfo act_info)
+        : alpha(act_info.a()),
+          beta(act_info.b()),
+          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
+          vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
+    {
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] vval Vector of values.
+     */
+    void operator()(ExactType &vval)
+    {
+        vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
+    }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(beta, val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const T         beta;   /**< Scalar beta */
+    const ExactType valpha; /**< Vector of alphas. */
+    const ExactType vbeta;  /**< Vector of betas. */
+};
+} // namespace detail
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H */
diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
new file mode 100644
index 0000000000..ac196d9dbb
--- /dev/null
+++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IMultiImage.h" +#include "arm_compute/core/Utils.h" +#include "src/core/NEON/NEMath.h" + +#include + +namespace +{ +#ifndef DOXYGEN_SKIP_THIS +constexpr float red_coef_bt709 = 1.5748F; +constexpr float green_coef_bt709 = -0.1873f; +constexpr float green_coef2_bt709 = -0.4681f; +constexpr float blue_coef_bt709 = 1.8556f; + +constexpr float rgb2yuv_bt709_kr = 0.2126f; +constexpr float rgb2yuv_bt709_kb = 0.0722f; +// K_g = 1 - K_r - K_b +constexpr float rgb2yuv_bt709_kg = 0.7152f; +// C_u = 1 / (2 * (1 - K_b)) +constexpr float rgb2yuv_bt709_cu = 0.5389f; +// C_v = 1 / (2 * (1 - K_r)) +constexpr float rgb2yuv_bt709_cv = 0.6350f; + +constexpr float rgb2u8_red_coef = 0.2126f; +constexpr float rgb2u8_green_coef = 0.7152f; +constexpr float rgb2u8_blue_coef = 0.0722f; + +inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, + const float rcoef, const float gcoef, const float bcoef) +{ + float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); + greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); + greyscale = vmlaq_n_f32(greyscale, bcolor, bcoef); + return greyscale; +} + +inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) +{ + float32x4x4_t out_float32; + + //Conversion from 3(RGB) 4 uint8s to 3(RGB) 4 floats + const float32x4x4_t r_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[0]); + const float32x4x4_t g_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[1]); + const float32x4x4_t b_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[2]); + + //New grayscale image = ( (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B) ) + //Computation of 1(Greyscale) 4 uint8 using 3(RGB) 4 uint8s float + out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0], + rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); + + out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1], + rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); + + out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2], + rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); + + out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3], + rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); + + //Conversion from 1(Greyscale) 4 floats to 1(Greyscale) 4 uint8s + arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); +} + +inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, + float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec) +{ + /* + Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' + U'=-0.1146*R' - 0.3854*G' + 0.5000*B' + V'= 0.5000*R' - 0.4542*G' - 0.0458*B' + */ + const auto c128 = vdupq_n_f32(128.f); + + // Y = R * K_r + G * (1 - K_r - K_b) * B * K_b + yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr); + yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg); + yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb); + + // U = (B - Y) / (2 * (1 - K_b)) + uvec = vsubq_f32(bvec, yvec); + uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu); + + // V = (R - Y) / (2 * (1 - K_r)) + vvec = vsubq_f32(rvec, yvec); + vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv); +} + +inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t 
&yyvec_val, + float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha) +{ + float32x4x3_t rgb1, rgb2; + + // Compute: cb - 128 and cr - 128; + const auto c128 = vdupq_n_f32(128.f); + uvec_val = vsubq_f32(uvec_val, c128); + vvec_val = vsubq_f32(vvec_val, c128); + + // Compute: + // r = 0.0000f*f_u + 1.5748f*f_v; + // g = 0.1873f*f_u - 0.4681f*f_v; + // b = 1.8556f*f_u + 0.0000f*f_v; + const auto red = vmulq_n_f32(vvec_val, red_coef_bt709); + const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709); + const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), + vmulq_n_f32(vvec_val, green_coef2_bt709)); + + // Compute the final r,g,b values using y1 for the first texel and y2 for the second one. + // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t + // and written back to memory using vst3 instruction + + rgb1.val[0] = vaddq_f32(yvec_val, red); + rgb1.val[1] = vaddq_f32(yvec_val, green); + rgb1.val[2] = vaddq_f32(yvec_val, blue); + + rgb2.val[0] = vaddq_f32(yyvec_val, red); + rgb2.val[1] = vaddq_f32(yyvec_val, green); + rgb2.val[2] = vaddq_f32(yyvec_val, blue); + + uint8x8x3_t u8_rgb; + arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb); + + if(!alpha) + { + vst3_lane_u8(&output_ptr[0], u8_rgb, 0); + vst3_lane_u8(&output_ptr[3], u8_rgb, 4); + vst3_lane_u8(&output_ptr[6], u8_rgb, 1); + vst3_lane_u8(&output_ptr[9], u8_rgb, 5); + vst3_lane_u8(&output_ptr[12], u8_rgb, 2); + vst3_lane_u8(&output_ptr[15], u8_rgb, 6); + vst3_lane_u8(&output_ptr[18], u8_rgb, 3); + vst3_lane_u8(&output_ptr[21], u8_rgb, 7); + } + else + { + uint8x8x4_t u8_rgba; + u8_rgba.val[0] = u8_rgb.val[0]; + u8_rgba.val[1] = u8_rgb.val[1]; + u8_rgba.val[2] = u8_rgb.val[2]; + u8_rgba.val[3] = vdup_n_u8(255); + vst4_lane_u8(&output_ptr[0], u8_rgba, 0); + vst4_lane_u8(&output_ptr[4], u8_rgba, 4); + vst4_lane_u8(&output_ptr[8], u8_rgba, 1); + vst4_lane_u8(&output_ptr[12], u8_rgba, 5); + vst4_lane_u8(&output_ptr[16], u8_rgba, 2); + vst4_lane_u8(&output_ptr[20], u8_rgba, 6); + vst4_lane_u8(&output_ptr[24], u8_rgba, 3); + vst4_lane_u8(&output_ptr[28], u8_rgba, 7); + } +} + +inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha) +{ + uint8x16x3_t rgb; + + if(alpha) + { + const auto tmp = vld4q_u8(ptr); + rgb.val[0] = tmp.val[0]; + rgb.val[1] = tmp.val[1]; + rgb.val[2] = tmp.val[2]; + } + else + { + rgb = vld3q_u8(ptr); + } + + return rgb; +} + +inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom) +{ + // Convert the uint8x16_t to float32x4x4_t + const float32x4x4_t frvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[0]); + const float32x4x4_t fgvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[1]); + const float32x4x4_t fbvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[2]); + + const float32x4x4_t frvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[0]); + const float32x4x4_t fgvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[1]); + const float32x4x4_t fbvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[2]); + + float32x4x4_t fyvec_top, fuvec_top, fvvec_top; + float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom; + + for(auto i = 0; i < 4; ++i) + { + rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], + fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]); + rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], + fyvec_bottom.val[i], 
fuvec_bottom.val[i], fvvec_bottom.val[i]); + } + + arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]); + arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); +} + +inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, + unsigned char *const __restrict out_uv) +{ + uint8x16x3_t vec_top, vec_bottom; + vec_top.val[0] = rvec_top; + vec_top.val[1] = gvec_top; + vec_top.val[2] = bvec_top; + vec_bottom.val[0] = rvec_bottom; + vec_bottom.val[1] = gvec_bottom; + vec_bottom.val[2] = bvec_bottom; + + rgb_to_yuv_conversion(vec_top, vec_bottom); + + vst1q_u8(out_y_top, vec_top.val[0]); + vst1q_u8(out_y_bottom, vec_bottom.val[0]); + + const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]); + const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]); + const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]); + const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]); + + uint8x8x2_t uvvec; + uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp)); + uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp)); + + vst2_u8(out_uv, uvvec); +} + +inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, + unsigned char *const __restrict out_u, + unsigned char *const __restrict out_v) +{ + uint8x16x3_t vec_top, vec_bottom; + vec_top.val[0] = rvec_top; + vec_top.val[1] = gvec_top; + vec_top.val[2] = bvec_top; + vec_bottom.val[0] = rvec_bottom; + vec_bottom.val[1] = gvec_bottom; + vec_bottom.val[2] = bvec_bottom; + + rgb_to_yuv_conversion(vec_top, vec_bottom); + + vst1q_u8(out_y_top, vec_top.val[0]); + vst1q_u8(out_y_bottom, vec_bottom.val[0]); + + const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); + const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]); + const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), + vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); + + vst1_u8(out_u, vget_low_u8(uvvec)); + vst1_u8(out_v, vget_high_u8(uvvec)); +} + +inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, + unsigned char *const __restrict out_y, + unsigned char *const __restrict out_u, + unsigned char *const __restrict out_v) +{ + // Convert the uint8x16_t to float32x4x4_t + const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec); + const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec); + const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); + + float32x4x4_t fyvec, fuvec, fvvec; + for(auto i = 0; i < 4; ++i) + { + rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], + fyvec.val[i], fuvec.val[i], fvvec.val[i]); + } + + uint8x16_t yvec, uvec, vvec; + 
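+
+    // A minimal sketch of the float32x4x4_t -> uint8x16_t narrowing performed by the
+    // convert_float32x4x4_to_uint8x16() calls below (an assumption about the helper,
+    // shown here only to illustrate the saturating convert/narrow chain):
+    //
+    //   uint8x16_t narrow_to_u8(const float32x4x4_t &in)
+    //   {
+    //       const uint16x8_t lo = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
+    //                                          vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+    //       const uint16x8_t hi = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
+    //                                          vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+    //       return vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi));
+    //   }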
arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec); + + vst1q_u8(out_y, yvec); + vst1q_u8(out_u, uvec); + vst1q_u8(out_v, vvec); +} +#endif /* DOXYGEN_SKIP_THIS */ +} + +namespace arm_compute +{ +/** Convert RGB to RGBX. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output RGBX buffer. + * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16x4_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + ta2.val[3] = vdupq_n_u8(255); + vst4q_u8(out.ptr(), ta2); + }, + in, out); +} + +/** Convert RGB to U8. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output U8 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16_t ta2; + rgb_to_u8_conversion(ta1, ta2); + vst1q_u8(out.ptr(), ta2); + }, + in, out); +} + +/** Convert RGBX to RGB. + * + * @param[in] input Input RGBX data buffer. + * @param[out] output Output RGB buffer. + * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta1 = vld4q_u8(in.ptr()); + uint8x16x3_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + vst3q_u8(out.ptr(), ta2); + }, + in, out); +} + +/** Convert YUYV to RGB. + * + * @param[in] input Input YUYV data buffer. + * @param[out] output Output RGB buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto element_size = alpha ? 32 : 24; + constexpr auto shift = yuyv ? 0 : 1; + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta = vld4q_u8(in.ptr()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... 
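+
+        // The layout comments above use full-resolution pixel indices and describe the
+        // YUYV packing (Y0 U0 Y1 V0 ...). For UYVY (U0 Y0 V0 Y1 ...) the shift template
+        // parameter remaps the lanes; a worked sketch of the indexing used below:
+        //
+        //   YUYV (shift == 0): Y_even = ta.val[0], U = ta.val[1], Y_odd = ta.val[2], V = ta.val[3]
+        //   UYVY (shift == 1): Y_even = ta.val[1], U = ta.val[0], Y_odd = ta.val[3], V = ta.val[2]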
+ + // Convert the uint8x16x4_t to float32x4x4_t + const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); + const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); + const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); + const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); + + yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); + }, + in, out); +} + +/** Convert NV12 to RGB. + * + * @param[in] input Input NV12 data buffer. + * @param[out] output Output RGB buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto element_size = alpha ? 32 : 24; + const auto out_stride = output_ptr->info()->strides_in_bytes().y(); + constexpr auto shift = uv ? 0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... 
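+
+        // In NV12 each U/V pair covers a 2x2 block of pixels, which is why the same
+        // uvec/vvec are reused for both the top and bottom output rows below. A scalar
+        // sketch of the sampling (indices are full-resolution pixel coordinates):
+        //
+        //   rgb(2x + dx, y + dy) uses Y(2x + dx, y + dy), U(x), V(x)   for dx, dy in {0, 1}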
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_uv, out); +} + +/** Convert IYUV to RGB. + * + * @param[in] input Input IYUV data buffer. + * @param[out] output Output RGB buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto element_size = alpha ? 
32 : 24;
+    const auto out_stride = output_ptr->info()->strides_in_bytes().y();
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto *y_top_ptr    = in_y.ptr();
+        const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
+        const auto *u_ptr        = in_u.ptr();
+        const auto *v_ptr        = in_v.ptr();
+
+        // Work around an issue in gcc (>= 9) where vld2q can cause problems with register allocation
+#if defined(__aarch64__)
+        const auto ta0_y_top    = vld1q_u8(y_top_ptr);
+        const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
+        const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
+        const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
+        const auto ta_u         = vld1q_u8(u_ptr);
+        const auto ta_v         = vld1q_u8(v_ptr);
+
+        // Convert the uint8x16_t to float32x4x4_t
+        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
+        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
+        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
+        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
+        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+#else  /* defined(__aarch64__) */
+        const auto ta_y_top    = vld2q_u8(y_top_ptr);
+        const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
+        const auto ta_u        = vld1q_u8(u_ptr);
+        const auto ta_v        = vld1q_u8(v_ptr);
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_u        = U0 U2 U4 U6 ...
+        //ta_v        = V0 V2 V4 V6 ...
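+
+        // vld2q_u8 de-interleaves the even/odd luma samples in a single instruction;
+        // the vld1q_u8 + vuzp1q_u8/vuzp2q_u8 sequence in the branch above builds the
+        // same split manually (hence aarch64 only). A sketch of the equivalence:
+        //
+        //   vuzp1q_u8(lo, hi) == vld2q_u8(ptr).val[0]   // bytes 0, 2, 4, ... 30
+        //   vuzp2q_u8(lo, hi) == vld2q_u8(ptr).val[1]   // bytes 1, 3, 5, ... 31
+        //
+        // where lo = vld1q_u8(ptr) and hi = vld1q_u8(ptr + 16).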
+
+        // Convert the uint8x16_t to float32x4x4_t
+        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+#endif /* defined(__aarch64__) */
+
+        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+    },
+    in_y, in_u, in_v, out);
+}
+
+/** Convert YUYV to NV12.
+ *
+ * @param[in]  input  Input YUYV data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in]  win    Window for iterating the buffers.
+ *
+ */
+template <bool yuyv>
+void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = yuyv ? 0 : 1;
+
+    // NV12's UV plane is subsampled in both width and height
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto ta_top    = vld4q_u8(in.ptr());
+        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
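+
+        // Going from 4:2:2 to 4:2:0 keeps the horizontal chroma subsampling of the
+        // source and averages the two rows vertically: vhaddq_u8 below computes the
+        // truncating average (top + bottom) >> 1 per lane. Scalar sketch for one sample:
+        //
+        //   uint8_t u_out = static_cast<uint8_t>((u_top + u_bottom) >> 1);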
+ + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16x2_t uvvec; + uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst2q_u8(out_uv.ptr(), uvvec); + }, + in, out_y, out_uv); +} + +/** Convert IYUV to NV12. + * + * @param[in] input Input IYUV data buffer. + * @param[out] output Output NV12 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_u(input_ptr->plane(1), win_uv); + Iterator in_v(input_ptr->plane(2), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + uint8x16x2_t ta_uv; + ta_uv.val[0] = vld1q_u8(in_u.ptr()); + ta_uv.val[1] = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst2q_u8(out_uv.ptr(), ta_uv); + }, + in_y, in_u, in_v, out_y, out_uv); +} + +/** Convert NV12 to IYUV. + * + * @param[in] input Input NV12 data buffer. + * @param[out] output Output IYUV buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto shift = uv ? 
0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); + vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); + }, + in_y, in_uv, out_y, out_u, out_v); +} + +/** Convert YUYV to IYUV. + * + * @param[in] input Input YUYV data buffer. + * @param[out] output Output IYUV buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto shift = yuyv ? 0 : 1; + + // Destination's UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16_t uvec; + uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + vst1q_u8(out_u.ptr(), uvec); + + uint8x16_t vvec; + vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst1q_u8(out_v.ptr(), vvec); + }, + in, out_y, out_u, out_v); +} + +/** Convert NV12 to YUV4. + * + * @param[in] input Input NV12 data buffer. + * @param[out] output Output YUV4 buffer. + * @param[in] win Window for iterating the buffers. 
+ * + */ +template +void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto shift = uv ? 0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_uv.val[0 + shift]; + uvec.val[1] = ta_uv.val[0 + shift]; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_uv.val[1 - shift]; + vvec.val[1] = ta_uv.val[1 - shift]; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_uv, out_y, out_u, out_v); +} + +/** Convert IYUV to YUV4. + * + * @param[in] input Input IYUV data buffer. + * @param[out] output Output YUV4 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_u(input_ptr->plane(1), win_uv); + Iterator in_v(input_ptr->plane(2), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u = U0 U2 U4 U6 ... + //ta_v = V0 V2 V4 V6 ... 
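+
+        // YUV4 is fully sampled, so every subsampled chroma value is replicated over a
+        // 2x2 block: storing the pair {v, v} with vst2q_u8 interleaves the vector with
+        // itself, and the same pair is written to two consecutive output rows. Sketch:
+        //
+        //   uint8x16x2_t rep = { { v, v } };
+        //   vst2q_u8(dst, rep); // writes v[0] v[0] v[1] v[1] ... (horizontal duplication)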
+ + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_u; + uvec.val[1] = ta_u; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_v; + vvec.val[1] = ta_v; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_u, in_v, out_y, out_u, out_v); +} + +/** Convert RGB to NV12. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output NV12 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], + ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], + out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), + out_uv.ptr()); + }, + in, out_y, out_uv); +} + +/** Convert RGB to IYUV. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output IYUV buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... 
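+
+        // A scalar sketch of the BT.709 forward transform that rgb_to_yuv_calculation()
+        // vectorises; the coefficient values shown are an assumption (see the constants
+        // defined at the top of this file):
+        //
+        //   const float y = 0.2126f * r + 0.7152f * g + 0.0722f * b;
+        //   const float u = 0.5389f * (b - y) + 128.f;
+        //   const float v = 0.6350f * (r - y) + 128.f;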
+ + store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], + ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], + out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), + out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); +} + +/** Convert RGB to YUV4. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output YUV4 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template +void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_rgb = load_rgb(in.ptr(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], + out_y.ptr(), out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h new file mode 100644 index 0000000000..96defbc9c9 --- /dev/null +++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H +#define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H + +#include + +namespace arm_compute +{ +namespace detail +{ +inline float32x4x3_t load_matrix_row(const float *ptr) +{ + const float32x4x3_t r = + { + { + vld1q_dup_f32(ptr), + vld1q_dup_f32(1 + ptr), + vld1q_dup_f32(2 + ptr) + } + }; + return r; +} + +template +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); + +template <> +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +{ + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + 4), + vld1q_f32(in_top + 8) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + 4), + vld1q_f32(in_mid + 8) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + 4), + vld1q_f32(in_low + 8) + } + }; + float32x4x2_t out = + { + { + vmulq_f32(vtop.val[0], m0.val[0]), + vmulq_f32(vtop.val[1], m0.val[0]) + } + }; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + +template +void store_results(float *buffer, const float32x4x2_t &values); + +template <> +void store_results<1>(float *buffer, const 
float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); + vst1q_f32(buffer + 4, values.val[1]); +} + +template <> +void store_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); +} + +template <> +void store_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vget_low_f32(values.val[0])); +} + +template +int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration); + +template <> +int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration; +} + +template <> +int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration << 1; +} + +template <> +int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration * 3; +} +} +} // namespace arm_compute +#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ \ No newline at end of file diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h new file mode 100644 index 0000000000..d7ee70a1cd --- /dev/null +++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H +#define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/utils/misc/Requires.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include + +namespace arm_compute +{ +namespace detail +{ +/** Loads a 3x3 matrix as a row (float). + * + * @param[in] ptr Pointer to a float 3x3 matrix. + * @param[in] weights_offset (Optional) Weights quantization offset. + * + * @return The loaded matrix. + */ +inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) +{ + ARM_COMPUTE_UNUSED(weights_offset); + const float32x4x3_t r = + { + { + vld1q_dup_f32(ptr), + vld1q_dup_f32(1 + ptr), + vld1q_dup_f32(2 + ptr) + } + }; + return r; +} + +/** Loads a 3x3 matrix as a row (uint8_t/int8_t). + * + * @param[in] ptr Pointer to a uint8_t/int8_t 3x3 matrix. + * @param[in] weights_offset (Optional) Weights quantization offset. + * + * @return The loaded matrix. 
+ */ +template < typename T, REQUIRES_TA(std::is_same::value || std::is_same::value) > +inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0) +{ + const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); + + /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: + r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ + int32x4x3_t r = + { + { + vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) + } + }; + return r; +} + +/** Stores a float32x4x2_t array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template +void store_results(float *buffer, const float32x4x2_t &values); + +template <> +inline void store_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); + vst1q_f32(buffer + 4, values.val[1]); +} + +template <> +inline void store_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vget_low_f32(values.val[0])); +} + +/** Stores a uint32_t array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template +void store_results(int32_t *buffer, const int32x4x2_t &values); + +template <> +inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, values.val[0]); + vst1q_s32(buffer + 4, values.val[1]); +} + +template <> +inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1_s32(buffer, vget_low_s32(values.val[0])); +} + +template +inline void accumulate_results(float *buffer, const float32x4x2_t &values); + +template <> +inline void accumulate_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); + vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1])); +} + +template <> +inline void accumulate_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); +} + +template <> +inline void accumulate_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0]))); +} + +template +void accumulate_results(int32_t *buffer, const int32x4x2_t &values); + +template <> +inline void accumulate_results<1>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0])); + vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1])); +} + +template <> +inline void accumulate_results<2>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0])); +} + +template <> +inline void accumulate_results<3>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0]))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +/** Stores a float16x8x2_t 
array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template +void store_results(float16_t *buffer, const float16x8x2_t &values); + +template <> +inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1q_f16(buffer, values.val[0]); + vst1q_f16(buffer + 8, values.val[1]); +} + +template <> +inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1q_f16(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1_f16(buffer, vget_low_f16(values.val[0])); +} + +template +inline void accumulate_results(float16_t *buffer, const float16x8x2_t &values); + +template <> +inline void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); + vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1])); +} + +template <> +inline void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); +} + +template <> +inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + dilation_x), + vld1q_f32(in_top + 2 * dilation_x) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + dilation_x), + vld1q_f32(in_mid + 2 * dilation_x) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + dilation_x), + vld1q_f32(in_low + 2 * dilation_x) + } + }; + float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]); + out = vmlaq_f32(out, vtop.val[1], m0.val[1]); + out = vmlaq_f32(out, vtop.val[2], m0.val[2]); + + out = vmlaq_f32(out, vmid.val[0], m1.val[0]); + out = vmlaq_f32(out, vmid.val[1], m1.val[1]); + out = vmlaq_f32(out, vmid.val[2], m1.val[2]); + + out = vmlaq_f32(out, vlow.val[0], m2.val[0]); + out = vmlaq_f32(out, vlow.val[1], m2.val[1]); + out = vmlaq_f32(out, vlow.val[2], m2.val[2]); + + return out; +} + +/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. 
+ * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, unsigned int stridex, int input_offset = 0) +{ + ARM_COMPUTE_ERROR_ON(stridex > 3); + float32x4x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) + } + }; + + if(stridex == 2) + { + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + } + else if(stridex == 3) + { + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + } + + return out; +} + +/** Perform a convolve3x3 on float32. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[out] out_ptr Pointer to the output. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +template +void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + unsigned int stridex, int input_offset = 0); + +template +inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + unsigned int stridex, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + ARM_COMPUTE_ERROR_ON(stridex > 3); + + float32x4x2_t out = + { + { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f) + } + }; + if(stridex == 2) + { + const float32x4x2_t vtop = vld2q_f32(in_top); + const float32x4x2_t vmid = vld2q_f32(in_mid); + const float32x4x2_t vlow = vld2q_f32(in_low); + const float32x4_t vtop_end = vld1q_f32(in_top + 8); + const float32x4_t vmid_end = vld1q_f32(in_mid + 8); + const float32x4_t vlow_end = vld1q_f32(in_low + 8); + + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + + out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]); + + accumulate ? 
accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); + } + else + { + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + 4), + vld1q_f32(in_top + 8) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + 4), + vld1q_f32(in_mid + 8) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + 4), + vld1q_f32(in_low + 8) + } + }; + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); + + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); + + if(stridex == 3) + { + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); + } + else + { + accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); + } + } +} + +/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset Input quantization offset. 
+ * + */ +template < typename T, REQUIRES_TA(std::is_same::value || std::is_same::value) > +inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + size_t dilation_x, int32_t input_offset) +{ + using VectorType = typename std::conditional::value, uint8x8x3_t, int8x8x3_t>::type; + using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); + + const VectorType vtop = + { + { + wrapper::vload(in_top), + wrapper::vload(in_top + dilation_x), + wrapper::vload(in_top + 2 * dilation_x) + } + }; + const VectorType vmid = + { + { + wrapper::vload(in_mid), + wrapper::vload(in_mid + dilation_x), + wrapper::vload(in_mid + 2 * dilation_x) + } + }; + const VectorType vlow = + { + { + wrapper::vload(in_low), + wrapper::vload(in_low + dilation_x), + wrapper::vload(in_low + 2 * dilation_x) + } + }; + + const int32x4x3_t vtop_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), + } + }; + const int32x4x3_t vmid_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), + } + }; + const int32x4x3_t vlow_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), + } + }; + + int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]); + out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]); + out = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]); + + out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]); + out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]); + out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]); + + out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]); + out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]); + out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]); + + return out; +} + +/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset Input quantization offset. 
+ * + */ +template < typename T, REQUIRES_TA(std::is_same::value || std::is_same::value) > +inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + const size_t dilation_x, unsigned int stridex, int input_offset) +{ + ARM_COMPUTE_ERROR_ON(stridex > 3); + int32x4x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) + } + }; + + if(stridex == 2) + { + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); + } + else if(stridex == 3) + { + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); + } + return out; +} + +/** Perform a convolve3x3 on 8-bit elements + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[out] out_ptr Pointer to the output. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset Input quantization offset. + * + */ +template < bool accumulate, typename T1, typename T2, REQUIRES_TA(std::is_same::value || std::is_same::value) > +void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + unsigned int stridex, int32_t input_offset) +{ + ARM_COMPUTE_ERROR_ON(stridex > 3); + using VectorType = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; + using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); + + const VectorType vtop = + { + { + wrapper::vload(in_top), + wrapper::vload(in_top + 8) + } + }; + const VectorType vmid = + { + { + wrapper::vload(in_mid), + wrapper::vload(in_mid + 8) + } + }; + const VectorType vlow = + { + { + wrapper::vload(in_low), + wrapper::vload(in_low + 8) + } + }; + + const int32x4x3_t vtop_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + } + }; + const int32x4x3_t vmid_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + } + }; + const int32x4x3_t vlow_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + } + }; + + int32x4x2_t out + { + { + 
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Loads a 3x3 matrix as a row (float16_t).
+ *
+ * @param[in] ptr Pointer to a float16 3x3 matrix.
+ *
+ * @return The loaded matrix.
+ */
+inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(weights_offset);
+    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+    const float16x8x3_t r =
+    {
+        {
+            vld1q_dup_f16(ptr),
+            vld1q_dup_f16(1 + ptr),
+            vld1q_dup_f16(2 + ptr)
+        }
+    };
+    return r;
+}
+
+/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset (Optional) Input quantization offset.
+ *
+ */
+inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                                const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                                const size_t dilation_x, int input_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+    const float16x8x3_t vtop =
+    {
+        {
+            vld1q_f16(in_top),
+            vld1q_f16(in_top + dilation_x),
+            vld1q_f16(in_top + 2 * dilation_x)
+        }
+    };
+    const float16x8x3_t vmid =
+    {
+        {
+            vld1q_f16(in_mid),
+            vld1q_f16(in_mid + dilation_x),
+            vld1q_f16(in_mid + 2 * dilation_x)
+        }
+    };
+    const float16x8x3_t vlow =
+    {
+        {
+            vld1q_f16(in_low),
+            vld1q_f16(in_low + dilation_x),
+            vld1q_f16(in_low + 2 * dilation_x)
+        }
+    };
+    float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
+    out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
+    out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
+
+    out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0]));
+    out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1]));
+    out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2]));
+
+    out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0]));
+    out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1]));
+    out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2]));
+
+    return out;
+}
+
+/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] stridex      Stride value in elements across x.
+ * @param[in] input_offset (Optional) Input quantization offset.
+ *
+ */
+inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                           const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                           const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+{
+    float16x8x2_t out =
+    {
+        {
+            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+            single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
+        }
+    };
+
+    if(stridex == 2)
+    {
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
+    }
+    else if(stridex == 3)
+    {
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
+        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
+    }
+
+    return out;
+}
+
+/** Perform a convolve3x3 on float16.
+ *
+ * @param[in]  in_top       Pointer to the first row of the input.
+ * @param[in]  in_mid       Pointer to the second row of the input.
+ * @param[in]  in_low       Pointer to the third row of the input.
+ * @param[out] out_ptr      Pointer to the output.
+ * @param[in]  m0           First row of the filter.
+ * @param[in]  m1           Second row of the filter.
+ * @param[in]  m2           Third row of the filter.
+ * @param[in]  stridex      Stride value in elements across x.
+ * @param[in]  input_offset (Optional) Input quantization offset.
+ *
+ */
+template <bool accumulate>
+inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
+                         const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                         unsigned int stridex, int input_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+
+    float16x8x2_t out =
+    {
+        {
+            vdupq_n_f16(0),
+            vdupq_n_f16(0)
+        }
+    };
+    if(stridex == 2)
+    {
+        const float16x8x2_t vtop     = vld2q_f16(in_top);
+        const float16x8x2_t vmid     = vld2q_f16(in_mid);
+        const float16x8x2_t vlow     = vld2q_f16(in_low);
+        const float16x8_t   vtop_end = vld1q_f16(in_top + 16);
+        const float16x8_t   vmid_end = vld1q_f16(in_mid + 16);
+        const float16x8_t   vlow_end = vld1q_f16(in_low + 16);
+
+        out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vtop.val[1], m0.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop_end, 1), m0.val[2]));
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[1], m1.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid_end, 1), m1.val[2]));
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[1], m2.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow_end, 1), m2.val[2]));
+
+        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
+    }
+    else
+    {
+        const float16x8x3_t vtop =
+        {
+            {
+                vld1q_f16(in_top),
+                vld1q_f16(in_top + 8),
+                vld1q_f16(in_top + 16)
+            }
+        };
+        const float16x8x3_t vmid =
+        {
+            {
+                vld1q_f16(in_mid),
+                vld1q_f16(in_mid + 8),
+                vld1q_f16(in_mid + 16)
+            }
+        };
+        const float16x8x3_t vlow =
+        {
+            {
+                vld1q_f16(in_low),
+                vld1q_f16(in_low + 8),
+                vld1q_f16(in_low + 16)
+            }
+        };
+        out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+        out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
+
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
+        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
+        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
+
+        if(stridex == 3)
+        {
+            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
+            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
+
+            accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
+        }
+        else
+        {
+            accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
+        }
+    }
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+/** Get the number of elements processed on 3x3 convolution.
+ *
+ * @param[in] num_elems_written_per_iteration Number of elements written per iteration on 3x3 convolution.
+ * @param[in] stridex                         Stride value in elements across x.
+ *
+ * @return The number of elements processed.
+ */
+inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
+{
+    switch(stridex)
+    {
+        case 1:
+            return num_elems_written_per_iteration;
+        case 2:
+            return num_elems_written_per_iteration << 1;
+        case 3:
+            return num_elems_written_per_iteration * 3;
+        default:
+            ARM_COMPUTE_ERROR("stridex not supported");
+            return 0;
+    }
+}
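+
+// Editorial example (not part of the original patch): writing 8 output elements per
+// iteration with stridex == 2 consumes twice as many input elements:
+//
+//   const int num_in = get_input_num_elems_processed(8, 2); // num_in == 16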
+} // namespace detail
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
diff --git a/src/core/NEON/wrapper/intrinsics/abs.h b/src/core/NEON/wrapper/intrinsics/abs.h
new file mode 100644
index 0000000000..0d49a9ebf1
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/abs.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_ABS_H
+#define ARM_COMPUTE_WRAPPER_ABS_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VABS_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vabs(const vtype &a)            \
+    {                                            \
+        return prefix##_##postfix(a);            \
+    }
+
+#define VQABS_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vqabs(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+// Absolute: vabs{q}_<type>. Vd[i] = |Va[i]|
+VABS_IMPL(int8x8_t, int8x8_t, vabs, s8)
+VABS_IMPL(int16x4_t, int16x4_t, vabs, s16)
+VABS_IMPL(int32x2_t, int32x2_t, vabs, s32)
+VABS_IMPL(float32x2_t, float32x2_t, vabs, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VABS_IMPL(float16x4_t, float16x4_t, vabs, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VABS_IMPL(int8x16_t, int8x16_t, vabsq, s8)
+VABS_IMPL(int16x8_t, int16x8_t, vabsq, s16)
+VABS_IMPL(int32x4_t, int32x4_t, vabsq, s32)
+VABS_IMPL(float32x4_t, float32x4_t, vabsq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VABS_IMPL(float16x8_t, float16x8_t, vabsq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+// Saturating absolute: vqabs{q}_<type>. Vd[i] = sat(|Va[i]|)
+VQABS_IMPL(int8x8_t, int8x8_t, vqabs, s8)
+VQABS_IMPL(int16x4_t, int16x4_t, vqabs, s16)
+VQABS_IMPL(int32x2_t, int32x2_t, vqabs, s32)
+
+VQABS_IMPL(int8x16_t, int8x16_t, vqabsq, s8)
+VQABS_IMPL(int16x8_t, int16x8_t, vqabsq, s16)
+VQABS_IMPL(int32x4_t, int32x4_t, vqabsq, s32)
+
+#undef VABS_IMPL
+#undef VQABS_IMPL
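+
+// Editorial illustration (not part of the original patch): both wrappers resolve to the
+// matching intrinsic from the argument type alone; note the difference on INT8_MIN:
+//
+//   const int8x8_t v  = vdup_n_s8(-128);
+//   const int8x8_t r1 = wrapper::vabs(v);  // vabs_s8:  |-128| wraps back to -128
+//   const int8x8_t r2 = wrapper::vqabs(v); // vqabs_s8: saturates to 127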
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_ABS_H */
diff --git a/src/core/NEON/wrapper/intrinsics/add.h b/src/core/NEON/wrapper/intrinsics/add.h
new file mode 100644
index 0000000000..6134d75b29
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/add.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_ADD_H
+#define ARM_COMPUTE_WRAPPER_ADD_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VADD_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vadd(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VADD_IMPL(uint8x8_t, uint8x8_t, vadd, u8)
+VADD_IMPL(int8x8_t, int8x8_t, vadd, s8)
+VADD_IMPL(uint16x4_t, uint16x4_t, vadd, u16)
+VADD_IMPL(int16x4_t, int16x4_t, vadd, s16)
+VADD_IMPL(uint32x2_t, uint32x2_t, vadd, u32)
+VADD_IMPL(int32x2_t, int32x2_t, vadd, s32)
+VADD_IMPL(uint64x1_t, uint64x1_t, vadd, u64)
+VADD_IMPL(int64x1_t, int64x1_t, vadd, s64)
+VADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VADD_IMPL(uint8x16_t, uint8x16_t, vaddq, u8)
+VADD_IMPL(int8x16_t, int8x16_t, vaddq, s8)
+VADD_IMPL(uint16x8_t, uint16x8_t, vaddq, u16)
+VADD_IMPL(int16x8_t, int16x8_t, vaddq, s16)
+VADD_IMPL(uint32x4_t, uint32x4_t, vaddq, u32)
+VADD_IMPL(int32x4_t, int32x4_t, vaddq, s32)
+VADD_IMPL(uint64x2_t, uint64x2_t, vaddq, u64)
+VADD_IMPL(int64x2_t, int64x2_t, vaddq, s64)
+VADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VADD_IMPL
+
+// VQADD: Vector saturating add (No notion of saturation for floating point)
+#define VQADD_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vqadd(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8)
+VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8)
+VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16)
+VQADD_IMPL(int16x4_t, int16x4_t, vqadd, s16)
+VQADD_IMPL(uint32x2_t, uint32x2_t, vqadd, u32)
+VQADD_IMPL(int32x2_t, int32x2_t, vqadd, s32)
+VQADD_IMPL(uint64x1_t, uint64x1_t, vqadd, u64)
+VQADD_IMPL(int64x1_t, int64x1_t, vqadd, s64)
+VQADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VQADD_IMPL(uint8x16_t, uint8x16_t, vqaddq, u8)
+VQADD_IMPL(int8x16_t, int8x16_t, vqaddq, s8)
+VQADD_IMPL(uint16x8_t, uint16x8_t, vqaddq, u16)
+VQADD_IMPL(int16x8_t, int16x8_t, vqaddq, s16)
+VQADD_IMPL(uint32x4_t, uint32x4_t, vqaddq, u32)
+VQADD_IMPL(int32x4_t, int32x4_t, vqaddq, s32)
+VQADD_IMPL(uint64x2_t, uint64x2_t, vqaddq, u64)
+VQADD_IMPL(int64x2_t, int64x2_t, vqaddq, s64)
+VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VQADD_IMPL
+
+// VADDW: Vector widening add
+#define VADDW_IMPL(wtype, vtype, prefix, postfix)      \
+    inline wtype vaddw(const wtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8)
+VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8)
+VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16)
+VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16)
+VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32)
+VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32)
+#undef VADDW_IMPL
+
+// VADDL: Vector long add
+#define VADDL_IMPL(wtype, vtype, prefix, postfix)      \
+    inline wtype vaddl(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8)
+VADDL_IMPL(int16x8_t, int8x8_t, vaddl, s8)
+VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16)
+VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16)
+VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32)
+VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32)
+#undef VADDL_IMPL
+
+#if defined(__aarch64__)
+// VADDV: Across vector add
+#define VADDV_IMPL(stype, vtype, prefix, postfix) \
+    inline stype vaddv(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8)
+VADDV_IMPL(int8_t, int8x8_t, vaddv, s8)
+VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16)
+VADDV_IMPL(int16_t, int16x4_t, vaddv, s16)
+VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32)
+VADDV_IMPL(int32_t, int32x2_t, vaddv, s32)
+VADDV_IMPL(float, float32x2_t, vaddv, f32)
+
+VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8)
+VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8)
+VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16)
+VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16)
+VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32)
+VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32)
+VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64)
+VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64)
+VADDV_IMPL(float, float32x4_t, vaddvq, f32)
+#undef VADDV_IMPL
+#endif // defined(__aarch64__)
+
+// VPADDL: Signed add long pairwise
+#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \
+    inline ltype vpaddl(const vtype &a)            \
+    {                                              \
+        return prefix##_##postfix(a);              \
+    }
+
+VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8)
+VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8)
+VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16)
+VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16)
+VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32)
+VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32)
+
+VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8)
+VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8)
+VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16)
+VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16)
+VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32)
+VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32)
+#undef VPADDL_IMPL
+
+// VPADD: Add pairwise
+#define VPADD_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vpadd(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VPADD_IMPL(uint8x8_t, uint8x8_t, vpadd, u8)
+VPADD_IMPL(int8x8_t, int8x8_t, vpadd, s8)
+VPADD_IMPL(uint16x4_t, uint16x4_t, vpadd, u16)
+VPADD_IMPL(int16x4_t, int16x4_t, vpadd, s16)
+VPADD_IMPL(uint32x2_t, uint32x2_t, vpadd, u32)
+VPADD_IMPL(int32x2_t, int32x2_t, vpadd, s32)
+VPADD_IMPL(float32x2_t, float32x2_t, vpadd, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPADD_IMPL
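+
+// Editorial illustration (not part of the original patch): the widening forms avoid
+// 8-bit overflow when accumulating; a and b are hypothetical uint8x8_t rows:
+//
+//   const uint16x8_t wide = wrapper::vaddl(a, b);  // per-lane 16-bit sums
+//   const uint32x4_t acc  = wrapper::vpaddl(wide); // pairwise-widened 32-bit partials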
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_ADD_H */
diff --git a/src/core/NEON/wrapper/intrinsics/and.h b/src/core/NEON/wrapper/intrinsics/and.h
new file mode 100644
index 0000000000..6ff7df3f5a
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/and.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_AND_H
+#define ARM_COMPUTE_WRAPPER_AND_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VAND_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vand(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VAND_IMPL(uint8_t, uint8x8_t, vand, u8)
+VAND_IMPL(int8_t, int8x8_t, vand, s8)
+VAND_IMPL(uint16_t, uint16x4_t, vand, u16)
+VAND_IMPL(int16_t, int16x4_t, vand, s16)
+VAND_IMPL(uint32_t, uint32x2_t, vand, u32)
+VAND_IMPL(int32_t, int32x2_t, vand, s32)
+VAND_IMPL(uint64_t, uint64x1_t, vand, u64)
+VAND_IMPL(int64_t, int64x1_t, vand, s64)
+
+VAND_IMPL(uint8_t, uint8x16_t, vandq, u8)
+VAND_IMPL(int8_t, int8x16_t, vandq, s8)
+VAND_IMPL(uint16_t, uint16x8_t, vandq, u16)
+VAND_IMPL(int16_t, int16x8_t, vandq, s16)
+VAND_IMPL(uint32_t, uint32x4_t, vandq, u32)
+VAND_IMPL(int32_t, int32x4_t, vandq, s32)
+VAND_IMPL(uint64_t, uint64x2_t, vandq, u64)
+VAND_IMPL(int64_t, int64x2_t, vandq, s64)
+
+#undef VAND_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_AND_H */
diff --git a/src/core/NEON/wrapper/intrinsics/bsl.h b/src/core/NEON/wrapper/intrinsics/bsl.h
new file mode 100644
index 0000000000..01c1cce3a6
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/bsl.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_BSL_H
+#define ARM_COMPUTE_WRAPPER_BSL_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VBSL_IMPL(stype, vtype, ctype, prefix, postfix)               \
+    inline vtype vbsl(const ctype &a, const vtype &b, const vtype &c) \
+    {                                                                 \
+        return prefix##_##postfix(a, b, c);                           \
+    }
+
+VBSL_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8)
+VBSL_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8)
+VBSL_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16)
+VBSL_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16)
+VBSL_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32)
+VBSL_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32)
+VBSL_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VBSL_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VBSL_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8)
+VBSL_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8)
+VBSL_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16)
+VBSL_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16)
+VBSL_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32)
+VBSL_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32)
+VBSL_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VBSL_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VBSL_IMPL
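+
+// Editorial illustration (not part of the original patch): vbsl keeps lanes of the second
+// argument where the mask bits are set and lanes of the third elsewhere; the mask usually
+// comes from one of the comparison wrappers added by this patch (a, b, c hypothetical
+// float32x4_t values):
+//
+//   const uint32x4_t  mask = wrapper::vceq(a, b);
+//   const float32x4_t sel  = wrapper::vbsl(mask, a, c); // a where a == b, else c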
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_BSL_H */
diff --git a/src/core/NEON/wrapper/intrinsics/ceq.h b/src/core/NEON/wrapper/intrinsics/ceq.h
new file mode 100644
index 0000000000..b0324e63db
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/ceq.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CEQ_H
+#define ARM_COMPUTE_WRAPPER_CEQ_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCEQ_IMPL(votype, vtype, prefix, postfix)      \
+    inline votype vceq(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8)
+VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8)
+VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16)
+VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16)
+VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32)
+VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32)
+VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8)
+VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8)
+VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16)
+VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16)
+VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32)
+VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32)
+VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCEQ_IMPL
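+
+// Editorial note (not part of the original patch): each result lane is all ones when the
+// operands compare equal and all zeros otherwise, ready for vand/vbsl masking:
+//
+//   const uint32x2_t m = wrapper::vceq(vdup_n_f32(1.f), vdup_n_f32(1.f)); // { ~0U, ~0U }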
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CEQ_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cge.h b/src/core/NEON/wrapper/intrinsics/cge.h
new file mode 100644
index 0000000000..e4a7fcd423
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cge.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CGE_H
+#define ARM_COMPUTE_WRAPPER_CGE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCGE_IMPL(stype, vtype, rtype, prefix, postfix) \
+    inline rtype vcge(const vtype &a, const vtype &b)   \
+    {                                                   \
+        return prefix##_##postfix(a, b);                \
+    }
+
+VCGE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcge, u8)
+VCGE_IMPL(int8_t, int8x8_t, uint8x8_t, vcge, s8)
+VCGE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcge, u16)
+VCGE_IMPL(int16_t, int16x4_t, uint16x4_t, vcge, s16)
+VCGE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcge, u32)
+VCGE_IMPL(int32_t, int32x2_t, uint32x2_t, vcge, s32)
+VCGE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcge, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcge, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCGE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgeq, u8)
+VCGE_IMPL(int8_t, int8x16_t, uint8x16_t, vcgeq, s8)
+VCGE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgeq, u16)
+VCGE_IMPL(int16_t, int16x8_t, uint16x8_t, vcgeq, s16)
+VCGE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgeq, u32)
+VCGE_IMPL(int32_t, int32x4_t, uint32x4_t, vcgeq, s32)
+VCGE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgeq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgeq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCGE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CGE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cgt.h b/src/core/NEON/wrapper/intrinsics/cgt.h
new file mode 100644
index 0000000000..f34d02fd1b
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cgt.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CGT_H
+#define ARM_COMPUTE_WRAPPER_CGT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCGT_IMPL(rtype, vtype, prefix, postfix)      \
+    inline rtype vcgt(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8)
+VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8)
+VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16)
+VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16)
+VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32)
+VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32)
+VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8)
+VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8)
+VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16)
+VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16)
+VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32)
+VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32)
+VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCGT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CGT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cle.h b/src/core/NEON/wrapper/intrinsics/cle.h
new file mode 100644
index 0000000000..50c175f0c8
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cle.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CLE_H
+#define ARM_COMPUTE_WRAPPER_CLE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCLE_IMPL(stype, vtype, rtype, prefix, postfix) \
+    inline rtype vcle(const vtype &a, const vtype &b)   \
+    {                                                   \
+        return prefix##_##postfix(a, b);                \
+    }
+
+VCLE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcle, u8)
+VCLE_IMPL(int8_t, int8x8_t, uint8x8_t, vcle, s8)
+VCLE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcle, u16)
+VCLE_IMPL(int16_t, int16x4_t, uint16x4_t, vcle, s16)
+VCLE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcle, u32)
+VCLE_IMPL(int32_t, int32x2_t, uint32x2_t, vcle, s32)
+VCLE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcle, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcle, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCLE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcleq, u8)
+VCLE_IMPL(int8_t, int8x16_t, uint8x16_t, vcleq, s8)
+VCLE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcleq, u16)
+VCLE_IMPL(int16_t, int16x8_t, uint16x8_t, vcleq, s16)
+VCLE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcleq, u32)
+VCLE_IMPL(int32_t, int32x4_t, uint32x4_t, vcleq, s32)
+VCLE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcleq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcleq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCLE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CLE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/clt.h b/src/core/NEON/wrapper/intrinsics/clt.h
new file mode 100644
index 0000000000..10fd320e4c
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/clt.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CLT_H
+#define ARM_COMPUTE_WRAPPER_CLT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCLT_IMPL(votype, vtype, prefix, postfix)      \
+    inline votype vclt(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8)
+VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8)
+VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16)
+VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16)
+VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32)
+VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32)
+VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8)
+VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8)
+VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16)
+VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16)
+VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32)
+VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32)
+VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCLT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CLT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/combine.h b/src/core/NEON/wrapper/intrinsics/combine.h
new file mode 100644
index 0000000000..8b6a588f51
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/combine.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_COMBINE_H
+#define ARM_COMPUTE_WRAPPER_COMBINE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCOMBINE_IMPL(rtype, vtype, prefix, postfix)      \
+    inline rtype vcombine(const vtype &a, const vtype &b) \
+    {                                                     \
+        return prefix##_##postfix(a, b);                  \
+    }
+
+VCOMBINE_IMPL(uint8x16_t, uint8x8_t, vcombine, u8)
+VCOMBINE_IMPL(int8x16_t, int8x8_t, vcombine, s8)
+VCOMBINE_IMPL(uint16x8_t, uint16x4_t, vcombine, u16)
+VCOMBINE_IMPL(int16x8_t, int16x4_t, vcombine, s16)
+VCOMBINE_IMPL(uint32x4_t, uint32x2_t, vcombine, u32)
+VCOMBINE_IMPL(int32x4_t, int32x2_t, vcombine, s32)
+VCOMBINE_IMPL(float32x4_t, float32x2_t, vcombine, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCOMBINE_IMPL(float16x8_t, float16x4_t, vcombine, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCOMBINE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_COMBINE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h
new file mode 100644
index 0000000000..6e79a92bc2
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cvt.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CVT_H
+#define ARM_COMPUTE_WRAPPER_CVT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2)                   \
+    template <typename T>                                                            \
+    inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
+    vcvt(const vtype &a)                                                             \
+    {                                                                                \
+        return prefix##_##postfix1##_##postfix2(a);                                  \
+    }
+
+VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
+VCVT_TO_F32_IMPL(float32x4_t, int32x4_t, vcvtq, f32, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VCVT_TO_F32_IMPL
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2)                       \
+    template <typename T>                                                                \
+    inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
+    vcvt(const vtype &a)                                                                 \
+    {                                                                                    \
+        return prefix##_##postfix1##_##postfix2(a);                                      \
+    }
+
+VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
+#undef VCVT_TO_F16_IMPL
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type
+vcvt(const float32x4_t &a)
+{
+    return vcvtq_u32_f32(a);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int32x4_t>::type
+vcvt(const float32x4_t &a)
+{
+    return vcvtq_s32_f32(a);
+}
+
+#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
+/** Convert 2x128-bit floating point vectors into 1x128-bit bfloat16 vector
+ *
+ * @param[in]     inptr  Pointer to the input memory to load values from
+ * @param[in,out] outptr Pointer to the output memory to store values to
+ */
+inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
+{
+    __asm __volatile(
+        "ldp    q0, q1, [%[inptr]]\n"
+        ".inst  0x0ea16800\n" // BFCVTN v0, v0
+        ".inst  0x4ea16820\n" // BFCVTN2 v0, v1
+        "str    q0, [%[outptr]]\n"
+        : [inptr] "+r"(inptr)
+        : [outptr] "r"(outptr)
+        : "v0", "v1", "memory");
+}
+#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
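+
+// Editorial illustration (not part of the original patch): the destination element type
+// is chosen via the template argument rather than deduced from the argument:
+//
+//   const float32x4_t f = wrapper::vcvt<float>(vdupq_n_s32(2)); // resolves to vcvtq_f32_s32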
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CVT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h
new file mode 100644
index 0000000000..265f30d33b
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/div.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_DIV_H
+#define ARM_COMPUTE_WRAPPER_DIV_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#ifdef __aarch64__
+
+#define VDIV_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vdiv(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+VDIV_IMPL(float32x2_t, float32x2_t, vdiv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x4_t, float16x4_t, vdiv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VDIV_IMPL(float32x4_t, float32x4_t, vdivq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x8_t, float16x8_t, vdivq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#else // __aarch64__
+
+#define VDIV_IMPL(stype, vtype, mul_prefix, inv_prefix, postfix)   \
+    inline vtype vdiv(const vtype &a, const vtype &b)              \
+    {                                                              \
+        return mul_prefix##_##postfix(a, inv_prefix##_##postfix(b)); \
+    }
+VDIV_IMPL(float32x2_t, float32x2_t, vmul, vinv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x4_t, float16x4_t, vmul, vinv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VDIV_IMPL(float32x4_t, float32x4_t, vmulq, vinvq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDIV_IMPL(float16x8_t, float16x8_t, vmulq, vinvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#endif // __aarch64__
+
+#undef VDIV_IMPL
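+
+// Editorial note (not part of the original patch): on AArch32 there is no vector divide
+// instruction, so vdiv is composed from a multiply and the vinv reciprocal wrapper; on
+// AArch64 it maps directly to vdiv/vdivq (a, b hypothetical float32x4_t values):
+//
+//   const float32x4_t q = wrapper::vdiv(a, b);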
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_DIV_H */
diff --git a/src/core/NEON/wrapper/intrinsics/dup_n.h b/src/core/NEON/wrapper/intrinsics/dup_n.h
new file mode 100644
index 0000000000..e745aa4a8c
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/dup_n.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_DUP_N_H
+#define ARM_COMPUTE_WRAPPER_DUP_N_H
+
+#include "src/core/NEON/wrapper/traits.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VDUP_N_IMPL(stype, vtype, prefix, postfix, tag) \
+    inline vtype vdup_n(stype value, tag)               \
+    {                                                   \
+        return prefix##_##postfix(value);               \
+    }
+
+VDUP_N_IMPL(uint8_t, uint8x8_t, vdup_n, u8, traits::vector_64_tag)
+VDUP_N_IMPL(int8_t, int8x8_t, vdup_n, s8, traits::vector_64_tag)
+VDUP_N_IMPL(uint16_t, uint16x4_t, vdup_n, u16, traits::vector_64_tag)
+VDUP_N_IMPL(int16_t, int16x4_t, vdup_n, s16, traits::vector_64_tag)
+VDUP_N_IMPL(uint32_t, uint32x2_t, vdup_n, u32, traits::vector_64_tag)
+VDUP_N_IMPL(int32_t, int32x2_t, vdup_n, s32, traits::vector_64_tag)
+VDUP_N_IMPL(float, float32x2_t, vdup_n, f32, traits::vector_64_tag)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDUP_N_IMPL(float16_t, float16x4_t, vdup_n, f16, traits::vector_64_tag)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VDUP_N_IMPL(uint8_t, uint8x16_t, vdupq_n, u8, traits::vector_128_tag)
+VDUP_N_IMPL(int8_t, int8x16_t, vdupq_n, s8, traits::vector_128_tag)
+VDUP_N_IMPL(uint16_t, uint16x8_t, vdupq_n, u16, traits::vector_128_tag)
+VDUP_N_IMPL(int16_t, int16x8_t, vdupq_n, s16, traits::vector_128_tag)
+VDUP_N_IMPL(uint32_t, uint32x4_t, vdupq_n, u32, traits::vector_128_tag)
+VDUP_N_IMPL(int32_t, int32x4_t, vdupq_n, s32, traits::vector_128_tag)
+VDUP_N_IMPL(float, float32x4_t, vdupq_n, f32, traits::vector_128_tag)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VDUP_N_IMPL(float16_t, float16x8_t, vdupq_n, f16, traits::vector_128_tag)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VDUP_N_IMPL
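+
+// Editorial illustration (not part of the original patch): the trailing tag selects the
+// register width at compile time, letting templated kernels stay width-agnostic:
+//
+//   const auto half = wrapper::vdup_n(int16_t(7), wrapper::traits::vector_64_tag{});  // int16x4_t
+//   const auto full = wrapper::vdup_n(int16_t(7), wrapper::traits::vector_128_tag{}); // int16x8_t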
diff --git a/src/core/NEON/wrapper/intrinsics/eor.h b/src/core/NEON/wrapper/intrinsics/eor.h
new file mode 100644
index 0000000000..ce88cf59e7
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/eor.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_EOR_H
+#define ARM_COMPUTE_WRAPPER_EOR_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEOR_IMPL(vtype, prefix, postfix) \
+    inline vtype veor(const vtype &a, const vtype &b) \
+    { \
+        return prefix##_##postfix(a, b); \
+    }
+
+VEOR_IMPL(uint8x8_t, veor, u8)
+VEOR_IMPL(int8x8_t, veor, s8)
+VEOR_IMPL(uint16x4_t, veor, u16)
+VEOR_IMPL(int16x4_t, veor, s16)
+VEOR_IMPL(uint32x2_t, veor, u32)
+VEOR_IMPL(int32x2_t, veor, s32)
+
+VEOR_IMPL(uint8x16_t, veorq, u8)
+VEOR_IMPL(int8x16_t, veorq, s8)
+VEOR_IMPL(uint16x8_t, veorq, u16)
+VEOR_IMPL(int16x8_t, veorq, s16)
+VEOR_IMPL(uint32x4_t, veorq, u32)
+VEOR_IMPL(int32x4_t, veorq, s32)
+
+#undef VEOR_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_EOR_H */
diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h
new file mode 100644
index 0000000000..c2a6970967
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/exp.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_EXP_H
+#define ARM_COMPUTE_WRAPPER_EXP_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEXPQ_IMPL(vtype, postfix) \
+    inline vtype vexpq(const vtype &a) \
+    { \
+        return vexpq_##postfix(a); \
+    }
+
+#define VEXPQ_IMPL_INT(vtype, postfix) \
+    inline vtype vexpq(const vtype &a) \
+    { \
+        ARM_COMPUTE_UNUSED(a); \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VEXPQ_IMPL(float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VEXPQ_IMPL(float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VEXPQ_IMPL_INT(int32x4_t, s32)
+#undef VEXPQ_IMPL
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_EXP_H */
diff --git a/src/core/NEON/wrapper/intrinsics/ext.h b/src/core/NEON/wrapper/intrinsics/ext.h
new file mode 100644
index 0000000000..d44b231bb2
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/ext.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_EXT_H
+#define ARM_COMPUTE_WRAPPER_EXT_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEXT_IMPL(vtype, prefix, postfix, size) \
+    inline vtype vext_##size(vtype value_a, vtype value_b) \
+    { \
+        return prefix##_##postfix(value_a, value_b, size); \
+    }
+
+VEXT_IMPL(uint8x8_t, vext, u8, 1)
+VEXT_IMPL(uint8x8_t, vext, u8, 2)
+VEXT_IMPL(int8x8_t, vext, s8, 1)
+VEXT_IMPL(int8x8_t, vext, s8, 2)
+VEXT_IMPL(uint16x4_t, vext, u16, 1)
+VEXT_IMPL(uint16x4_t, vext, u16, 2)
+VEXT_IMPL(int16x4_t, vext, s16, 1)
+VEXT_IMPL(int16x4_t, vext, s16, 2)
+
+VEXT_IMPL(uint8x16_t, vextq, u8, 1)
+VEXT_IMPL(uint8x16_t, vextq, u8, 2)
+VEXT_IMPL(int8x16_t, vextq, s8, 1)
+VEXT_IMPL(int8x16_t, vextq, s8, 2)
+VEXT_IMPL(uint16x8_t, vextq, u16, 1)
+VEXT_IMPL(uint16x8_t, vextq, u16, 2)
+VEXT_IMPL(int16x8_t, vextq, s16, 1)
+VEXT_IMPL(int16x8_t, vextq, s16, 2)
+VEXT_IMPL(int32x4_t, vextq, s32, 1)
+VEXT_IMPL(int32x4_t, vextq, s32, 2)
+
+#undef VEXT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_EXT_H */
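NEON's vext intrinsics take the shift as a compile-time immediate, which is why the macro above bakes the size into the function name (vext_1, vext_2) instead of accepting a runtime argument. An illustrative sketch, not part of the patch:

    // Concatenate-and-shift by one lane: result = { a[1..7], b[0] }.
    uint8x8_t shift_one(uint8x8_t a, uint8x8_t b)
    {
        return arm_compute::wrapper::vext_1(a, b); // == vext_u8(a, b, 1)
    }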
diff --git a/src/core/NEON/wrapper/intrinsics/gethigh.h b/src/core/NEON/wrapper/intrinsics/gethigh.h
new file mode 100644
index 0000000000..d098a27335
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/gethigh.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_GET_HIGH_H
+#define ARM_COMPUTE_WRAPPER_GET_HIGH_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETHIGH_IMPL(half_vtype, vtype, postfix) \
+    inline half_vtype vgethigh(const vtype val) \
+    { \
+        return vget_high_##postfix(val); \
+    }
+
+VGETHIGH_IMPL(uint8x8_t, uint8x16_t, u8)
+VGETHIGH_IMPL(int8x8_t, int8x16_t, s8)
+VGETHIGH_IMPL(uint16x4_t, uint16x8_t, u16)
+VGETHIGH_IMPL(int16x4_t, int16x8_t, s16)
+VGETHIGH_IMPL(uint32x2_t, uint32x4_t, u32)
+VGETHIGH_IMPL(int32x2_t, int32x4_t, s32)
+VGETHIGH_IMPL(float32x2_t, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETHIGH_IMPL(float16x4_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETHIGH_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_GET_HIGH_H */
diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h
new file mode 100644
index 0000000000..2052751612
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/getlane.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_GET_LANE_H
+#define ARM_COMPUTE_WRAPPER_GET_LANE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETLANE_IMPL_8(stype, vtype, postfix) \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vget_lane_##postfix(vector, 0); \
+            case 1: \
+                return vget_lane_##postfix(vector, 1); \
+            case 2: \
+                return vget_lane_##postfix(vector, 2); \
+            case 3: \
+                return vget_lane_##postfix(vector, 3); \
+            case 4: \
+                return vget_lane_##postfix(vector, 4); \
+            case 5: \
+                return vget_lane_##postfix(vector, 5); \
+            case 6: \
+                return vget_lane_##postfix(vector, 6); \
+            case 7: \
+                return vget_lane_##postfix(vector, 7); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VGETLANE_IMPL_4(stype, vtype, postfix) \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vget_lane_##postfix(vector, 0); \
+            case 1: \
+                return vget_lane_##postfix(vector, 1); \
+            case 2: \
+                return vget_lane_##postfix(vector, 2); \
+            case 3: \
+                return vget_lane_##postfix(vector, 3); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VGETLANE_IMPL_2(stype, vtype, postfix) \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vget_lane_##postfix(vector, 0); \
+            case 1: \
+                return vget_lane_##postfix(vector, 1); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+VGETLANE_IMPL_8(uint8_t, uint8x8_t, u8)
+VGETLANE_IMPL_8(int8_t, int8x8_t, s8)
+VGETLANE_IMPL_4(uint16_t, uint16x4_t, u16)
+VGETLANE_IMPL_4(int16_t, int16x4_t, s16)
+VGETLANE_IMPL_2(uint32_t, uint32x2_t, u32)
+VGETLANE_IMPL_2(int32_t, int32x2_t, s32)
+VGETLANE_IMPL_2(float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#define VGETQLANE_IMPL_16(stype, vtype, postfix) \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vgetq_lane_##postfix(vector, 0); \
+            case 1: \
+                return vgetq_lane_##postfix(vector, 1); \
+            case 2: \
+                return vgetq_lane_##postfix(vector, 2); \
+            case 3: \
+                return vgetq_lane_##postfix(vector, 3); \
+            case 4: \
+                return vgetq_lane_##postfix(vector, 4); \
+            case 5: \
+                return vgetq_lane_##postfix(vector, 5); \
+            case 6: \
+                return vgetq_lane_##postfix(vector, 6); \
+            case 7: \
+                return vgetq_lane_##postfix(vector, 7); \
+            case 8: \
+                return vgetq_lane_##postfix(vector, 8); \
+            case 9: \
+                return vgetq_lane_##postfix(vector, 9); \
+            case 10: \
+                return vgetq_lane_##postfix(vector, 10); \
+            case 11: \
+                return vgetq_lane_##postfix(vector, 11); \
+            case 12: \
+                return vgetq_lane_##postfix(vector, 12); \
+            case 13: \
+                return vgetq_lane_##postfix(vector, 13); \
+            case 14: \
+                return vgetq_lane_##postfix(vector, 14); \
+            case 15: \
+                return vgetq_lane_##postfix(vector, 15); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VGETQLANE_IMPL_8(stype, vtype, postfix) \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vgetq_lane_##postfix(vector, 0); \
+            case 1: \
+                return vgetq_lane_##postfix(vector, 1); \
+            case 2: \
+                return vgetq_lane_##postfix(vector, 2); \
+            case 3: \
+                return vgetq_lane_##postfix(vector, 3); \
+            case 4: \
+                return vgetq_lane_##postfix(vector, 4); \
+            case 5: \
+                return vgetq_lane_##postfix(vector, 5); \
+            case 6: \
+                return vgetq_lane_##postfix(vector, 6); \
+            case 7: \
+                return vgetq_lane_##postfix(vector, 7); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VGETQLANE_IMPL_4(stype, vtype, postfix) \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vgetq_lane_##postfix(vector, 0); \
+            case 1: \
+                return vgetq_lane_##postfix(vector, 1); \
+            case 2: \
+                return vgetq_lane_##postfix(vector, 2); \
+            case 3: \
+                return vgetq_lane_##postfix(vector, 3); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VGETQLANE_IMPL_2(stype, vtype, postfix) \
+    inline stype vgetlane(const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vgetq_lane_##postfix(vector, 0); \
+            case 1: \
+                return vgetq_lane_##postfix(vector, 1); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+VGETQLANE_IMPL_16(uint8_t, uint8x16_t, u8)
+VGETQLANE_IMPL_16(int8_t, int8x16_t, s8)
+VGETQLANE_IMPL_8(uint16_t, uint16x8_t, u16)
+VGETQLANE_IMPL_8(int16_t, int16x8_t, s16)
+VGETQLANE_IMPL_4(uint32_t, uint32x4_t, u32)
+VGETQLANE_IMPL_4(int32_t, int32x4_t, s32)
+VGETQLANE_IMPL_4(float, float32x4_t, f32)
+VGETQLANE_IMPL_2(int64_t, int64x2_t, s64)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETQLANE_IMPL_8(float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETLANE_IMPL_8
+#undef VGETLANE_IMPL_4
+#undef VGETLANE_IMPL_2
+
+#undef VGETQLANE_IMPL_16
+#undef VGETQLANE_IMPL_8
+#undef VGETQLANE_IMPL_4
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_GET_LANE_H */
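vget_lane also demands an immediate lane index, so vgetlane wraps it in a switch over the runtime index; when the index is a compile-time constant the optimizer folds the switch back down to a single extract. A sketch, not part of the patch:

    float last_lane(float32x4_t v)
    {
        // Dispatches to vgetq_lane_f32(v, 3); out-of-range indices hit ARM_COMPUTE_ERROR.
        return arm_compute::wrapper::vgetlane(v, 3);
    }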
diff --git a/src/core/NEON/wrapper/intrinsics/getlow.h b/src/core/NEON/wrapper/intrinsics/getlow.h
new file mode 100644
index 0000000000..b5469f0eab
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/getlow.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_GET_LOW_H
+#define ARM_COMPUTE_WRAPPER_GET_LOW_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VGETLOW_IMPL(half_vtype, vtype, postfix) \
+    inline half_vtype vgetlow(const vtype val) \
+    { \
+        return vget_low_##postfix(val); \
+    }
+
+VGETLOW_IMPL(uint8x8_t, uint8x16_t, u8)
+VGETLOW_IMPL(int8x8_t, int8x16_t, s8)
+VGETLOW_IMPL(uint16x4_t, uint16x8_t, u16)
+VGETLOW_IMPL(int16x4_t, int16x8_t, s16)
+VGETLOW_IMPL(uint32x2_t, uint32x4_t, u32)
+VGETLOW_IMPL(int32x2_t, int32x4_t, s32)
+VGETLOW_IMPL(float32x2_t, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VGETLOW_IMPL(float16x4_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VGETLOW_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_GET_LOW_H */
diff --git a/src/core/NEON/wrapper/intrinsics/intrinsics.h b/src/core/NEON/wrapper/intrinsics/intrinsics.h
new file mode 100644
index 0000000000..495321a6a1
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_INTRINSICS_H
+#define ARM_COMPUTE_WRAPPER_INTRINSICS_H
+
+#include "src/core/NEON/wrapper/intrinsics/abs.h"
+#include "src/core/NEON/wrapper/intrinsics/add.h"
+#include "src/core/NEON/wrapper/intrinsics/and.h"
+#include "src/core/NEON/wrapper/intrinsics/bsl.h"
+#include "src/core/NEON/wrapper/intrinsics/ceq.h"
+#include "src/core/NEON/wrapper/intrinsics/cge.h"
+#include "src/core/NEON/wrapper/intrinsics/cgt.h"
+#include "src/core/NEON/wrapper/intrinsics/cle.h"
+#include "src/core/NEON/wrapper/intrinsics/clt.h"
+#include "src/core/NEON/wrapper/intrinsics/combine.h"
+#include "src/core/NEON/wrapper/intrinsics/cvt.h"
+#include "src/core/NEON/wrapper/intrinsics/div.h"
+#include "src/core/NEON/wrapper/intrinsics/dup_n.h"
+#include "src/core/NEON/wrapper/intrinsics/eor.h"
+#include "src/core/NEON/wrapper/intrinsics/exp.h"
+#include "src/core/NEON/wrapper/intrinsics/ext.h"
+#include "src/core/NEON/wrapper/intrinsics/gethigh.h"
+#include "src/core/NEON/wrapper/intrinsics/getlane.h"
+#include "src/core/NEON/wrapper/intrinsics/getlow.h"
+#include "src/core/NEON/wrapper/intrinsics/inv.h"
+#include "src/core/NEON/wrapper/intrinsics/invsqrt.h"
+#include "src/core/NEON/wrapper/intrinsics/load.h"
+#include "src/core/NEON/wrapper/intrinsics/log.h"
+#include "src/core/NEON/wrapper/intrinsics/max.h"
+#include "src/core/NEON/wrapper/intrinsics/min.h"
+#include "src/core/NEON/wrapper/intrinsics/mla.h"
+#include "src/core/NEON/wrapper/intrinsics/movl.h"
+#include "src/core/NEON/wrapper/intrinsics/movn.h"
+#include "src/core/NEON/wrapper/intrinsics/mul.h"
+#include "src/core/NEON/wrapper/intrinsics/neg.h"
+#include "src/core/NEON/wrapper/intrinsics/not.h"
+#include "src/core/NEON/wrapper/intrinsics/orr.h"
+#include "src/core/NEON/wrapper/intrinsics/pmax.h"
+#include "src/core/NEON/wrapper/intrinsics/pmin.h"
+#include "src/core/NEON/wrapper/intrinsics/pow.h"
+#include "src/core/NEON/wrapper/intrinsics/qmov.h"
+#include "src/core/NEON/wrapper/intrinsics/qmovun.h"
+#include "src/core/NEON/wrapper/intrinsics/reinterpret.h"
+#include "src/core/NEON/wrapper/intrinsics/rev64.h"
+#include "src/core/NEON/wrapper/intrinsics/round.h"
+#include "src/core/NEON/wrapper/intrinsics/setlane.h"
+#include "src/core/NEON/wrapper/intrinsics/sin.h"
+#include "src/core/NEON/wrapper/intrinsics/store.h"
+#include "src/core/NEON/wrapper/intrinsics/sub.h"
+#include "src/core/NEON/wrapper/intrinsics/tanh.h"
+#include "src/core/NEON/wrapper/intrinsics/tbl.h"
+
+#endif /* ARM_COMPUTE_WRAPPER_INTRINSICS_H */
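Kernels are expected to pull in the whole wrapper surface through this single umbrella header rather than the individual files. A sketch of typical use, not part of the patch (function name is illustrative):

    #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"

    // acc + x * y on four float lanes, written once against the wrapper API.
    float32x4_t fma4(float32x4_t acc, float32x4_t x, float32x4_t y)
    {
        return arm_compute::wrapper::vmla(acc, x, y); // vmlaq_f32
    }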
diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h
new file mode 100644
index 0000000000..de398b0403
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/inv.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_INV_H
+#define ARM_COMPUTE_WRAPPER_INV_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VINV_IMPL(vtype, prefix, postfix) \
+    inline vtype vinv(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+#define VINV_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vinv(const vtype &a) \
+    { \
+        ARM_COMPUTE_UNUSED(a); \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VINV_IMPL(float32x2_t, vinv, f32)
+VINV_IMPL_INT(int32x2_t, vinv, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINV_IMPL(float16x4_t, vinv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VINV_IMPL(float32x4_t, vinvq, f32)
+VINV_IMPL_INT(int32x4_t, vinvq, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINV_IMPL(float16x8_t, vinvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VINV_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_INV_H */
diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h
new file mode 100644
index 0000000000..2343efa8f8
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_INVSQRT_H
+#define ARM_COMPUTE_WRAPPER_INVSQRT_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VINVSQRT_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vinvsqrt(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+#define VINVSQRT_IMPL_INT(stype, vtype, prefix, postfix) \
+    inline vtype vinvsqrt(const vtype &a) \
+    { \
+        ARM_COMPUTE_UNUSED(a); \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VINVSQRT_IMPL(float, float32x2_t, vinvsqrt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL(float16_t, float16x4_t, vinvsqrt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL_INT(int, int32x4_t, vinvsqrt, s32)
+
+VINVSQRT_IMPL(float, float32x4_t, vinvsqrtq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL(float16_t, float16x8_t, vinvsqrtq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VINVSQRT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_INVSQRT_H */
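Both vinv and vinvsqrt forward to the software approximations defined in NEMath (vinvq_f32, vinvsqrtq_f32, and friends) rather than to single instructions, and the integer overloads deliberately abort. A sketch, not part of the patch:

    float32x4_t rsqrt(float32x4_t v)
    {
        // Newton-Raphson refined reciprocal square root from NEMath.
        return arm_compute::wrapper::vinvsqrt(v); // vinvsqrtq_f32(v)
    }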
diff --git a/src/core/NEON/wrapper/intrinsics/load.h b/src/core/NEON/wrapper/intrinsics/load.h
new file mode 100644
index 0000000000..a2116c028b
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/load.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_LOAD_H
+#define ARM_COMPUTE_WRAPPER_LOAD_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VLOAD_IMPL(stype, vtype, postfix) \
+    inline vtype vload(const stype *ptr) \
+    { \
+        return vld1_##postfix(ptr); \
+    }
+
+VLOAD_IMPL(uint8_t, uint8x8_t, u8)
+VLOAD_IMPL(int8_t, int8x8_t, s8)
+VLOAD_IMPL(uint16_t, uint16x4_t, u16)
+VLOAD_IMPL(int16_t, int16x4_t, s16)
+VLOAD_IMPL(uint32_t, uint32x2_t, u32)
+VLOAD_IMPL(int32_t, int32x2_t, s32)
+//VLOAD_IMPL(uint64_t, uint64x1_t, u64)
+//VLOAD_IMPL(int64_t, int64x1_t, s64)
+VLOAD_IMPL(float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOAD_IMPL(float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#define VLOADQ_IMPL(stype, vtype, postfix) \
+    inline vtype vloadq(const stype *ptr) \
+    { \
+        return vld1q_##postfix(ptr); \
+    }
+
+VLOADQ_IMPL(uint8_t, uint8x16_t, u8)
+VLOADQ_IMPL(int8_t, int8x16_t, s8)
+VLOADQ_IMPL(uint16_t, uint16x8_t, u16)
+VLOADQ_IMPL(int16_t, int16x8_t, s16)
+VLOADQ_IMPL(uint32_t, uint32x4_t, u32)
+VLOADQ_IMPL(int32_t, int32x4_t, s32)
+//VLOAD_IMPL(uint64_t, uint64x1_t, u64)
+//VLOAD_IMPL(int64_t, int64x1_t, s64)
+VLOADQ_IMPL(float, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOADQ_IMPL(float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VLOAD_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_LOAD_H */
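vload and vloadq mirror vld1/vld1q for unaligned loads into 64- and 128-bit registers; the 64-bit integer variants are left commented out. A sketch, not part of the patch:

    float32x4_t head(const float *ptr)
    {
        // Load four consecutive floats starting at ptr.
        return arm_compute::wrapper::vloadq(ptr); // vld1q_f32(ptr)
    }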
diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h
new file mode 100644
index 0000000000..357a77ca78
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/log.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_LOG_H
+#define ARM_COMPUTE_WRAPPER_LOG_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VLOG_IMPL(vtype, prefix, postfix) \
+    inline vtype vlog(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+#define VLOG_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vlog(const vtype &a) \
+    { \
+        ARM_COMPUTE_UNUSED(a); \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VLOG_IMPL(float32x4_t, vlogq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOG_IMPL(float16x8_t, vlogq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOG_IMPL_INT(int32x4_t, vlogq, s32)
+
+#undef VLOG_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_LOG_H */
diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h
new file mode 100644
index 0000000000..cec437d171
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/max.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MAX_H
+#define ARM_COMPUTE_WRAPPER_MAX_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMAX_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vmax(const vtype &a, const vtype &b) \
+    { \
+        return prefix##_##postfix(a, b); \
+    }
+
+VMAX_IMPL(uint8_t, uint8x8_t, vmax, u8)
+VMAX_IMPL(int8_t, int8x8_t, vmax, s8)
+VMAX_IMPL(uint16_t, uint16x4_t, vmax, u16)
+VMAX_IMPL(int16_t, int16x4_t, vmax, s16)
+VMAX_IMPL(uint32_t, uint32x2_t, vmax, u32)
+VMAX_IMPL(int32_t, int32x2_t, vmax, s32)
+VMAX_IMPL(float, float32x2_t, vmax, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAX_IMPL(float16_t, float16x4_t, vmax, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMAX_IMPL(uint8_t, uint8x16_t, vmaxq, u8)
+VMAX_IMPL(int8_t, int8x16_t, vmaxq, s8)
+VMAX_IMPL(uint16_t, uint16x8_t, vmaxq, u16)
+VMAX_IMPL(int16_t, int16x8_t, vmaxq, s16)
+VMAX_IMPL(uint32_t, uint32x4_t, vmaxq, u32)
+VMAX_IMPL(int32_t, int32x4_t, vmaxq, s32)
+VMAX_IMPL(float, float32x4_t, vmaxq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMAX_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
diff --git a/src/core/NEON/wrapper/intrinsics/min.h b/src/core/NEON/wrapper/intrinsics/min.h
new file mode 100644
index 0000000000..8afcb3cb10
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/min.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MIN_H
+#define ARM_COMPUTE_WRAPPER_MIN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMIN_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vmin(const vtype &a, const vtype &b) \
+    { \
+        return prefix##_##postfix(a, b); \
+    }
+
+VMIN_IMPL(uint8_t, uint8x8_t, vmin, u8)
+VMIN_IMPL(int8_t, int8x8_t, vmin, s8)
+VMIN_IMPL(uint16_t, uint16x4_t, vmin, u16)
+VMIN_IMPL(int16_t, int16x4_t, vmin, s16)
+VMIN_IMPL(uint32_t, uint32x2_t, vmin, u32)
+VMIN_IMPL(int32_t, int32x2_t, vmin, s32)
+VMIN_IMPL(float, float32x2_t, vmin, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMIN_IMPL(float16_t, float16x4_t, vmin, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMIN_IMPL(uint8_t, uint8x16_t, vminq, u8)
+VMIN_IMPL(int8_t, int8x16_t, vminq, s8)
+VMIN_IMPL(uint16_t, uint16x8_t, vminq, u16)
+VMIN_IMPL(int16_t, int16x8_t, vminq, s16)
+VMIN_IMPL(uint32_t, uint32x4_t, vminq, u32)
+VMIN_IMPL(int32_t, int32x4_t, vminq, s32)
+VMIN_IMPL(float, float32x4_t, vminq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMIN_IMPL(float16_t, float16x8_t, vminq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMIN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MIN_H */
diff --git a/src/core/NEON/wrapper/intrinsics/mla.h b/src/core/NEON/wrapper/intrinsics/mla.h
new file mode 100644
index 0000000000..2b38b34137
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/mla.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MLA_H
+#define ARM_COMPUTE_WRAPPER_MLA_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMLA_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
+    { \
+        return prefix##_##postfix(a, b, c); \
+    }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define VMLA_IMPL2(stype, vtype, prefix1, prefix2, postfix) \
+    inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
+    { \
+        return prefix1##_##postfix(a, prefix2##_##postfix(b, c)); \
+    }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMLA_IMPL(uint8x8_t, uint8x8_t, vmla, u8)
+VMLA_IMPL(int8x8_t, int8x8_t, vmla, s8)
+VMLA_IMPL(uint16x4_t, uint16x4_t, vmla, u16)
+VMLA_IMPL(int16x4_t, int16x4_t, vmla, s16)
+VMLA_IMPL(uint32x2_t, uint32x2_t, vmla, u32)
+VMLA_IMPL(int32x2_t, int32x2_t, vmla, s32)
+VMLA_IMPL(float32x2_t, float32x2_t, vmla, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMLA_IMPL2(float16x4_t, float16x4_t, vadd, vmul, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMLA_IMPL(uint8x16_t, uint8x16_t, vmlaq, u8)
+VMLA_IMPL(int8x16_t, int8x16_t, vmlaq, s8)
+VMLA_IMPL(uint16x8_t, uint16x8_t, vmlaq, u16)
+VMLA_IMPL(int16x8_t, int16x8_t, vmlaq, s16)
+VMLA_IMPL(uint32x4_t, uint32x4_t, vmlaq, u32)
+VMLA_IMPL(int32x4_t, int32x4_t, vmlaq, s32)
+VMLA_IMPL(float32x4_t, float32x4_t, vmlaq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMLA_IMPL2(float16x8_t, float16x8_t, vaddq, vmulq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMLA_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MLA_H */
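There is no native vmla used for FP16 here, so VMLA_IMPL2 synthesises it from an add and a multiply; every other type maps straight onto the vmla/vmlaq intrinsics. An illustrative equivalence, not part of the patch:

    // For float16x8_t (F16 builds only) the wrapper expands to:
    //   vmla(a, b, c) == vaddq_f16(a, vmulq_f16(b, c))
    // For, e.g., int32x4_t it is the single-instruction form:
    int32x4_t accumulate(int32x4_t a, int32x4_t b, int32x4_t c)
    {
        return arm_compute::wrapper::vmla(a, b, c); // vmlaq_s32: a + b * c
    }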
diff --git a/src/core/NEON/wrapper/intrinsics/movl.h b/src/core/NEON/wrapper/intrinsics/movl.h
new file mode 100644
index 0000000000..99f2150eab
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/movl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MOVL_H
+#define ARM_COMPUTE_WRAPPER_MOVL_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMOVL_IMPL(ptype, vtype, prefix, postfix) \
+    inline ptype vmovl(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+VMOVL_IMPL(uint16x8_t, uint8x8_t, vmovl, u8)
+VMOVL_IMPL(int16x8_t, int8x8_t, vmovl, s8)
+VMOVL_IMPL(uint32x4_t, uint16x4_t, vmovl, u16)
+VMOVL_IMPL(int32x4_t, int16x4_t, vmovl, s16)
+VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32)
+VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32)
+
+#undef VMOVL_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MOVL_H */
diff --git a/src/core/NEON/wrapper/intrinsics/movn.h b/src/core/NEON/wrapper/intrinsics/movn.h
new file mode 100644
index 0000000000..460c277540
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/movn.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_MOVN_H
+#define ARM_COMPUTE_WRAPPER_MOVN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VMOVN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vmovn(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+VMOVN_IMPL(uint32x2_t, uint64x2_t, vmovn, u64)
+VMOVN_IMPL(int32x2_t, int64x2_t, vmovn, s64)
+VMOVN_IMPL(uint16x4_t, uint32x4_t, vmovn, u32)
+VMOVN_IMPL(int16x4_t, int32x4_t, vmovn, s32)
+VMOVN_IMPL(uint8x8_t, uint16x8_t, vmovn, u16)
+VMOVN_IMPL(int8x8_t, int16x8_t, vmovn, s16)
+
+#define VQMOVN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vqmovn(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+VQMOVN_IMPL(uint32x2_t, uint64x2_t, vqmovn, u64)
+VQMOVN_IMPL(int32x2_t, int64x2_t, vqmovn, s64)
+VQMOVN_IMPL(uint16x4_t, uint32x4_t, vqmovn, u32)
+VQMOVN_IMPL(int16x4_t, int32x4_t, vqmovn, s32)
+VQMOVN_IMPL(uint8x8_t, uint16x8_t, vqmovn, u16)
+VQMOVN_IMPL(int8x8_t, int16x8_t, vqmovn, s16)
+
+#undef VMOVN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_MOVN_H */
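vmovl widens each lane to double width, while vmovn/vqmovn narrow back with truncating and saturating semantics respectively, the usual pattern around widening accumulation. A sketch, not part of the patch:

    int16x8_t widen(int8x8_t v)
    {
        return arm_compute::wrapper::vmovl(v); // vmovl_s8
    }

    int8x8_t narrow_sat(int16x8_t v)
    {
        return arm_compute::wrapper::vqmovn(v); // vqmovn_s16, saturating
    }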
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_WRAPPER_MUL_H +#define ARM_COMPUTE_WRAPPER_MUL_H + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VMUL_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vmul(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VMUL_IMPL(uint8x8_t, uint8x8_t, vmul, u8) +VMUL_IMPL(int8x8_t, int8x8_t, vmul, s8) +VMUL_IMPL(uint16x4_t, uint16x4_t, vmul, u16) +VMUL_IMPL(int16x4_t, int16x4_t, vmul, s16) +VMUL_IMPL(uint32x2_t, uint32x2_t, vmul, u32) +VMUL_IMPL(int32x2_t, int32x2_t, vmul, s32) +VMUL_IMPL(float32x2_t, float32x2_t, vmul, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMUL_IMPL(float16_t, float16x4_t, vmul, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VMUL_IMPL(uint8_t, uint8x16_t, vmulq, u8) +VMUL_IMPL(int8_t, int8x16_t, vmulq, s8) +VMUL_IMPL(uint16_t, uint16x8_t, vmulq, u16) +VMUL_IMPL(int16_t, int16x8_t, vmulq, s16) +VMUL_IMPL(uint32_t, uint32x4_t, vmulq, u32) +VMUL_IMPL(int32_t, int32x4_t, vmulq, s32) +VMUL_IMPL(float32x4_t, float32x4_t, vmulq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMUL_IMPL(float16_t, float16x8_t, vmulq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VMUL_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_MUL_H */ diff --git a/src/core/NEON/wrapper/intrinsics/neg.h b/src/core/NEON/wrapper/intrinsics/neg.h new file mode 100644 index 0000000000..5e4556664e --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/neg.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_WRAPPER_NEG_H +#define ARM_COMPUTE_WRAPPER_NEG_H + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VNEG_IMPL(vtype, prefix, postfix) \ + inline vtype vneg(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VNEG_IMPL(int8x8_t, vneg, s8) +VNEG_IMPL(int16x4_t, vneg, s16) +VNEG_IMPL(int32x2_t, vneg, s32) +VNEG_IMPL(float32x2_t, vneg, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VNEG_IMPL(float16x4_t, vneg, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VNEG_IMPL(int8x16_t, vnegq, s8) +VNEG_IMPL(int16x8_t, vnegq, s16) +VNEG_IMPL(int32x4_t, vnegq, s32) +VNEG_IMPL(float32x4_t, vnegq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VNEG_IMPL(float16x8_t, vnegq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VNEG_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_NEG_H */ diff --git a/src/core/NEON/wrapper/intrinsics/not.h b/src/core/NEON/wrapper/intrinsics/not.h new file mode 100644 index 0000000000..5853e849a2 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/not.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_WRAPPER_NOT_H +#define ARM_COMPUTE_WRAPPER_NOT_H + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VNOT_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vnot(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VNOT_IMPL(uint8_t, uint8x8_t, vmvn, u8) +VNOT_IMPL(int8_t, int8x8_t, vmvn, s8) +VNOT_IMPL(uint16_t, uint16x4_t, vmvn, u16) +VNOT_IMPL(int16_t, int16x4_t, vmvn, s16) +VNOT_IMPL(uint32_t, uint32x2_t, vmvn, u32) +VNOT_IMPL(int32_t, int32x2_t, vmvn, s32) +VNOT_IMPL(float32x2_t, float32x2_t, vinv, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VNOT_IMPL(float16x4_t, float16x4_t, vinv, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VNOT_IMPL(uint8_t, uint8x16_t, vmvnq, u8) +VNOT_IMPL(int8_t, int8x16_t, vmvnq, s8) +VNOT_IMPL(uint16_t, uint16x8_t, vmvnq, u16) +VNOT_IMPL(int16_t, int16x8_t, vmvnq, s16) +VNOT_IMPL(uint32_t, uint32x4_t, vmvnq, u32) +VNOT_IMPL(int32_t, int32x4_t, vmvnq, s32) +VNOT_IMPL(float32x4_t, float32x4_t, vinvq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VNOT_IMPL(float16x8_t, float16x8_t, vinvq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VNOT_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_NOT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/orr.h b/src/core/NEON/wrapper/intrinsics/orr.h new file mode 100644 index 0000000000..cc83e95d15 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/orr.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_WRAPPER_ORR_H +#define ARM_COMPUTE_WRAPPER_ORR_H + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VORR_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vorr(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VORR_IMPL(uint8_t, uint8x8_t, vorr, u8) +VORR_IMPL(int8_t, int8x8_t, vorr, s8) +VORR_IMPL(uint16_t, uint16x4_t, vorr, u16) +VORR_IMPL(int16_t, int16x4_t, vorr, s16) +VORR_IMPL(uint32_t, uint32x2_t, vorr, u32) +VORR_IMPL(int32_t, int32x2_t, vorr, s32) +VORR_IMPL(uint64_t, uint64x1_t, vorr, u64) +VORR_IMPL(int64_t, int64x1_t, vorr, s64) + +VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8) +VORR_IMPL(int8_t, int8x16_t, vorrq, s8) +VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16) +VORR_IMPL(int16_t, int16x8_t, vorrq, s16) +VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32) +VORR_IMPL(int32_t, int32x4_t, vorrq, s32) +VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64) +VORR_IMPL(int64_t, int64x2_t, vorrq, s64) + +#undef VORR_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_ORR_H */ diff --git a/src/core/NEON/wrapper/intrinsics/pmax.h b/src/core/NEON/wrapper/intrinsics/pmax.h new file mode 100644 index 0000000000..cd2b2d1f41 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/pmax.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_WRAPPER_PMAX_H +#define ARM_COMPUTE_WRAPPER_PMAX_H + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPMAX_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vpmax(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8) +VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8) +VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16) +VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16) +VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32) +VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32) +VPMAX_IMPL(float, float32x2_t, vpmax, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPMAX_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_PMAX_H */ diff --git a/src/core/NEON/wrapper/intrinsics/pmin.h b/src/core/NEON/wrapper/intrinsics/pmin.h new file mode 100644 index 0000000000..59b6be69ce --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/pmin.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_WRAPPER_PMIN_H +#define ARM_COMPUTE_WRAPPER_PMIN_H + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPMIN_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vpmin(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8) +VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8) +VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16) +VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16) +VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32) +VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32) +VPMIN_IMPL(float, float32x2_t, vpmin, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPMIN_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_PMIN_H */ diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h new file mode 100644 index 0000000000..61f834ed23 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/pow.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_WRAPPER_POW_H +#define ARM_COMPUTE_WRAPPER_POW_H + +#include "src/core/NEON/NEMath.h" +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPOW_IMPL(vtype, prefix, postfix) \ + inline vtype vpow(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPOW_IMPL(float32x4_t, vpowq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPOW_IMPL(float16x8_t, vpowq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPOW_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_POW_H */ diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h new file mode 100644 index 0000000000..167f3cf43b --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/qmov.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h
new file mode 100644
index 0000000000..167f3cf43b
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/qmov.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_QMOV_H
+#define ARM_COMPUTE_WRAPPER_QMOV_H
+
+#include <arm_neon.h>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
+vqmov(const int16x8_t &a)
+{
+    return vqmovun_s16(a);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
+vqmov(const int16x8_t &a)
+{
+    return vqmovn_s16(a);
+}
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_QMOV_H */
diff --git a/src/core/NEON/wrapper/intrinsics/qmovun.h b/src/core/NEON/wrapper/intrinsics/qmovun.h
new file mode 100644
index 0000000000..f823ddb513
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/qmovun.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_QMOVUN_H
+#define ARM_COMPUTE_WRAPPER_QMOVUN_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VQMOVUN_IMPL(dtype, vtype, prefix, postfix) \
+    inline dtype vqmovun(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+VQMOVUN_IMPL(uint32x2_t, int64x2_t, vqmovun, s64)
+VQMOVUN_IMPL(uint16x4_t, int32x4_t, vqmovun, s32)
+VQMOVUN_IMPL(uint8x8_t, int16x8_t, vqmovun, s16)
+
+#undef VQMOVUN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_QMOVUN_H */
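The two enable_if overloads let a quantized kernel select the correct saturating
narrow purely from its destination element type. A minimal sketch:

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    int16x8_t acc = vdupq_n_s16(300);
    // T = uint8_t resolves to vqmovun_s16: lanes saturate to 255.
    uint8x8_t u = arm_compute::wrapper::vqmov<uint8_t>(acc);
    // T = int8_t resolves to vqmovn_s16: lanes saturate to 127.
    int8x8_t s = arm_compute::wrapper::vqmov<int8_t>(acc);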
diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h
new file mode 100644
index 0000000000..0c26cd9008
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_REINTERPRET_H
+#define ARM_COMPUTE_WRAPPER_REINTERPRET_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VREINTERPRET_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+    inline ptype vreinterpret(const vtype &a) \
+    { \
+        return prefix##_##postfix1##_##postfix2(a); \
+    } \
+    \
+    inline ptype vreinterpret(const ptype &a) \
+    { \
+        return a; \
+    }
+
+VREINTERPRET_IMPL(int16x4_t, uint16x4_t, vreinterpret, s16, u16)
+
+VREINTERPRET_IMPL(int32x4_t, uint32x4_t, vreinterpretq, s32, u32)
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_REINTERPRET_H */
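The second, identity overload is what makes the wrapper generic: templated code can
call vreinterpret unconditionally, whether or not the operand already has the target
type. A minimal sketch:

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    uint16x4_t u = vdup_n_u16(65535);
    int16x4_t  s = arm_compute::wrapper::vreinterpret(u); // bits kept: -1 per lane
    int16x4_t  t = arm_compute::wrapper::vreinterpret(s); // identity overload, no-op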
diff --git a/src/core/NEON/wrapper/intrinsics/rev64.h b/src/core/NEON/wrapper/intrinsics/rev64.h
new file mode 100644
index 0000000000..0f0139c93b
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/rev64.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_REV64_H
+#define ARM_COMPUTE_WRAPPER_REV64_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VREV64_IMPL(vtype, prefix, postfix) \
+    inline vtype vrev64(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+VREV64_IMPL(uint8x8_t, vrev64, u8)
+VREV64_IMPL(int8x8_t, vrev64, s8)
+VREV64_IMPL(uint16x4_t, vrev64, u16)
+VREV64_IMPL(int16x4_t, vrev64, s16)
+VREV64_IMPL(uint32x2_t, vrev64, u32)
+VREV64_IMPL(int32x2_t, vrev64, s32)
+VREV64_IMPL(float32x2_t, vrev64, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VREV64_IMPL(float16x4_t, vrev64, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VREV64_IMPL(uint8x16_t, vrev64q, u8)
+VREV64_IMPL(int8x16_t, vrev64q, s8)
+VREV64_IMPL(uint16x8_t, vrev64q, u16)
+VREV64_IMPL(int16x8_t, vrev64q, s16)
+VREV64_IMPL(uint32x4_t, vrev64q, u32)
+VREV64_IMPL(int32x4_t, vrev64q, s32)
+VREV64_IMPL(float32x4_t, vrev64q, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VREV64_IMPL(float16x8_t, vrev64q, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VREV64_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_REV64_H */
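Usage sketch: vrev64 reverses element order within each 64-bit half of the vector,
which is how NEON code mirrors data without a full-width permute.

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    uint32x2_t v = { 1u, 2u };
    uint32x2_t r = arm_compute::wrapper::vrev64(v); // { 2u, 1u }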
diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h
new file mode 100644
index 0000000000..d23feb6b42
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/round.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_ROUND_H
+#define ARM_COMPUTE_WRAPPER_ROUND_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VROUNDQ_IMPL(vtype, postfix) \
+    inline vtype vround(const vtype &a) \
+    { \
+        return vroundq_rte_##postfix(a); \
+    }
+
+#define VROUNDQ_IMPL_INT(vtype, postfix) \
+    inline vtype vround(const vtype &a) \
+    { \
+        ARM_COMPUTE_UNUSED(a); \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VROUNDQ_IMPL(float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VROUNDQ_IMPL(float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VROUNDQ_IMPL_INT(int32x4_t, s32)
+#undef VROUNDQ_IMPL
+#undef VROUNDQ_IMPL_INT
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_ROUND_H */
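A sketch of the intended semantics, assuming NEMath's vroundq_rte_* rounds half to
even (the behaviour of AArch64 vrndnq_f32); the integer overload only raises an error
so that generic kernels still compile:

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    float32x4_t v = { 0.5f, 1.5f, 2.5f, -0.5f };
    float32x4_t r = arm_compute::wrapper::vround(v); // { 0.f, 2.f, 2.f, -0.f }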
diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h
new file mode 100644
index 0000000000..197eedacb5
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/setlane.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SET_LANE_H
+#define ARM_COMPUTE_WRAPPER_SET_LANE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vset_lane_##postfix(value, vector, 0); \
+            case 1: \
+                return vset_lane_##postfix(value, vector, 1); \
+            case 2: \
+                return vset_lane_##postfix(value, vector, 2); \
+            case 3: \
+                return vset_lane_##postfix(value, vector, 3); \
+            case 4: \
+                return vset_lane_##postfix(value, vector, 4); \
+            case 5: \
+                return vset_lane_##postfix(value, vector, 5); \
+            case 6: \
+                return vset_lane_##postfix(value, vector, 6); \
+            case 7: \
+                return vset_lane_##postfix(value, vector, 7); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vset_lane_##postfix(value, vector, 0); \
+            case 1: \
+                return vset_lane_##postfix(value, vector, 1); \
+            case 2: \
+                return vset_lane_##postfix(value, vector, 2); \
+            case 3: \
+                return vset_lane_##postfix(value, vector, 3); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vset_lane_##postfix(value, vector, 0); \
+            case 1: \
+                return vset_lane_##postfix(value, vector, 1); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+VSETLANE_IMPL_8(uint8x8_t, uint8_t, uint8x8_t, u8)
+VSETLANE_IMPL_8(int8x8_t, int8_t, int8x8_t, s8)
+VSETLANE_IMPL_4(uint16x4_t, uint16_t, uint16x4_t, u16)
+VSETLANE_IMPL_4(int16x4_t, int16_t, int16x4_t, s16)
+VSETLANE_IMPL_2(uint32x2_t, uint32_t, uint32x2_t, u32)
+VSETLANE_IMPL_2(int32x2_t, int32_t, int32x2_t, s32)
+VSETLANE_IMPL_2(float32x2_t, float, float32x2_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vsetq_lane_##postfix(value, vector, 0); \
+            case 1: \
+                return vsetq_lane_##postfix(value, vector, 1); \
+            case 2: \
+                return vsetq_lane_##postfix(value, vector, 2); \
+            case 3: \
+                return vsetq_lane_##postfix(value, vector, 3); \
+            case 4: \
+                return vsetq_lane_##postfix(value, vector, 4); \
+            case 5: \
+                return vsetq_lane_##postfix(value, vector, 5); \
+            case 6: \
+                return vsetq_lane_##postfix(value, vector, 6); \
+            case 7: \
+                return vsetq_lane_##postfix(value, vector, 7); \
+            case 8: \
+                return vsetq_lane_##postfix(value, vector, 8); \
+            case 9: \
+                return vsetq_lane_##postfix(value, vector, 9); \
+            case 10: \
+                return vsetq_lane_##postfix(value, vector, 10); \
+            case 11: \
+                return vsetq_lane_##postfix(value, vector, 11); \
+            case 12: \
+                return vsetq_lane_##postfix(value, vector, 12); \
+            case 13: \
+                return vsetq_lane_##postfix(value, vector, 13); \
+            case 14: \
+                return vsetq_lane_##postfix(value, vector, 14); \
+            case 15: \
+                return vsetq_lane_##postfix(value, vector, 15); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vsetq_lane_##postfix(value, vector, 0); \
+            case 1: \
+                return vsetq_lane_##postfix(value, vector, 1); \
+            case 2: \
+                return vsetq_lane_##postfix(value, vector, 2); \
+            case 3: \
+                return vsetq_lane_##postfix(value, vector, 3); \
+            case 4: \
+                return vsetq_lane_##postfix(value, vector, 4); \
+            case 5: \
+                return vsetq_lane_##postfix(value, vector, 5); \
+            case 6: \
+                return vsetq_lane_##postfix(value, vector, 6); \
+            case 7: \
+                return vsetq_lane_##postfix(value, vector, 7); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \
+    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
+    { \
+        switch(lane) \
+        { \
+            case 0: \
+                return vsetq_lane_##postfix(value, vector, 0); \
+            case 1: \
+                return vsetq_lane_##postfix(value, vector, 1); \
+            case 2: \
+                return vsetq_lane_##postfix(value, vector, 2); \
+            case 3: \
+                return vsetq_lane_##postfix(value, vector, 3); \
+            default: \
+                ARM_COMPUTE_ERROR("Invalid lane"); \
+        } \
+    }
+
+VSETQLANE_IMPL_16(uint8x16_t, uint8_t, uint8x16_t, u8)
+VSETQLANE_IMPL_16(int8x16_t, int8_t, int8x16_t, s8)
+VSETQLANE_IMPL_8(uint16x8_t, uint16_t, uint16x8_t, u16)
+VSETQLANE_IMPL_8(int16x8_t, int16_t, int16x8_t, s16)
+VSETQLANE_IMPL_4(uint32x4_t, uint32_t, uint32x4_t, u32)
+VSETQLANE_IMPL_4(int32x4_t, int32_t, int32x4_t, s32)
+VSETQLANE_IMPL_4(float32x4_t, float, float32x4_t, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSETQLANE_IMPL_8(float16x8_t, float16_t, float16x8_t, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VSETLANE_IMPL_8
+#undef VSETLANE_IMPL_4
+#undef VSETLANE_IMPL_2
+
+#undef VSETQLANE_IMPL_16
+#undef VSETQLANE_IMPL_8
+#undef VSETQLANE_IMPL_4
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SET_LANE_H */
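The switches exist because NEON's vset_lane_*/vsetq_lane_* intrinsics require a
compile-time lane index; vsetlane trades a branch for a lane chosen at run time.
A minimal sketch:

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    float32x2_t v = vdup_n_f32(0.f);
    unsigned int lane = 1; // known only at run time
    v = arm_compute::wrapper::vsetlane(42.f, v, lane); // { 0.f, 42.f }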
diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h
new file mode 100644
index 0000000000..03c2813a32
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/sin.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SIN_H
+#define ARM_COMPUTE_WRAPPER_SIN_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSIN_IMPL(vtype, prefix, postfix) \
+    inline vtype vsin(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+#define VSIN_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vsin(const vtype &a) \
+    { \
+        ARM_COMPUTE_UNUSED(a); \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
+VSIN_IMPL(float32x4_t, vsinq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSIN_IMPL(float16x8_t, vsinq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VSIN_IMPL_INT(int32x4_t, vsinq, s32)
+
+#undef VSIN_IMPL
+#undef VSIN_IMPL_INT
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SIN_H */
\ No newline at end of file
diff --git a/src/core/NEON/wrapper/intrinsics/store.h b/src/core/NEON/wrapper/intrinsics/store.h
new file mode 100644
index 0000000000..6dda432ea9
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/store.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_STORE_H
+#define ARM_COMPUTE_WRAPPER_STORE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSTORE_IMPL(stype, vtype, prefix, postfix) \
+    inline void vstore(stype *ptr, vtype val) \
+    { \
+        prefix##_##postfix(ptr, val); \
+    }
+
+VSTORE_IMPL(uint8_t, uint8x8_t, vst1, u8)
+VSTORE_IMPL(uint8_t, uint8x8x2_t, vst2, u8)
+VSTORE_IMPL(int8_t, int8x8_t, vst1, s8)
+VSTORE_IMPL(int8_t, int8x8x2_t, vst2, s8)
+VSTORE_IMPL(uint16_t, uint16x4_t, vst1, u16)
+VSTORE_IMPL(int16_t, int16x4_t, vst1, s16)
+VSTORE_IMPL(uint32_t, uint32x2_t, vst1, u32)
+VSTORE_IMPL(int32_t, int32x2_t, vst1, s32)
+//VSTORE_IMPL(uint64_t, 1, vst1, u64)
+//VSTORE_IMPL(int64_t, 1, vst1, s64)
+VSTORE_IMPL(float, float32x2_t, vst1, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSTORE_IMPL(float16_t, float16x4_t, vst1, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VSTORE_IMPL(uint8_t, uint8x16_t, vst1q, u8)
+VSTORE_IMPL(int8_t, int8x16_t, vst1q, s8)
+VSTORE_IMPL(uint16_t, uint16x8_t, vst1q, u16)
+VSTORE_IMPL(int16_t, int16x8_t, vst1q, s16)
+VSTORE_IMPL(uint32_t, uint32x4_t, vst1q, u32)
+VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32)
+//VSTORE_IMPL(uint64_t, 2, vst1q, u64)
+//VSTORE_IMPL(int64_t, 2, vst1q, s64)
+VSTORE_IMPL(float, float32x4_t, vst1q, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VSTORE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_STORE_H */
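Usage sketch: the vstore overloads resolve to vst1/vst1q (and vst2 for the x2 tuple
types) from the vector argument alone, so store code stays type-generic.

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    float buf[4];
    float32x4_t v = vdupq_n_f32(3.f);
    arm_compute::wrapper::vstore(buf, v); // vst1q_f32: buf = { 3, 3, 3, 3 }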
diff --git a/src/core/NEON/wrapper/intrinsics/sub.h b/src/core/NEON/wrapper/intrinsics/sub.h
new file mode 100644
index 0000000000..475986d0f6
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/sub.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SUB_H
+#define ARM_COMPUTE_WRAPPER_SUB_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSUB_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vsub(const vtype &a, const vtype &b) \
+    { \
+        return prefix##_##postfix(a, b); \
+    }
+
+VSUB_IMPL(uint8x8_t, uint8x8_t, vsub, u8)
+VSUB_IMPL(int8x8_t, int8x8_t, vsub, s8)
+VSUB_IMPL(uint16x4_t, uint16x4_t, vsub, u16)
+VSUB_IMPL(int16x4_t, int16x4_t, vsub, s16)
+VSUB_IMPL(uint32x2_t, uint32x2_t, vsub, u32)
+VSUB_IMPL(int32x2_t, int32x2_t, vsub, s32)
+VSUB_IMPL(uint64x1_t, uint64x1_t, vsub, u64)
+VSUB_IMPL(int64x1_t, int64x1_t, vsub, s64)
+VSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VSUB_IMPL(uint8x16_t, uint8x16_t, vsubq, u8)
+VSUB_IMPL(int8x16_t, int8x16_t, vsubq, s8)
+VSUB_IMPL(uint16x8_t, uint16x8_t, vsubq, u16)
+VSUB_IMPL(int16x8_t, int16x8_t, vsubq, s16)
+VSUB_IMPL(uint32x4_t, uint32x4_t, vsubq, u32)
+VSUB_IMPL(int32x4_t, int32x4_t, vsubq, s32)
+VSUB_IMPL(uint64x2_t, uint64x2_t, vsubq, u64)
+VSUB_IMPL(int64x2_t, int64x2_t, vsubq, s64)
+VSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VSUB_IMPL
+
+// VQSUB: Vector saturating sub (No notion of saturation for floating point)
+#define VQSUB_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vqsub(const vtype &a, const vtype &b) \
+    { \
+        return prefix##_##postfix(a, b); \
+    }
+
+VQSUB_IMPL(uint8x8_t, uint8x8_t, vqsub, u8)
+VQSUB_IMPL(int8x8_t, int8x8_t, vqsub, s8)
+VQSUB_IMPL(uint16x4_t, uint16x4_t, vqsub, u16)
+VQSUB_IMPL(int16x4_t, int16x4_t, vqsub, s16)
+VQSUB_IMPL(uint32x2_t, uint32x2_t, vqsub, u32)
+VQSUB_IMPL(int32x2_t, int32x2_t, vqsub, s32)
+VQSUB_IMPL(uint64x1_t, uint64x1_t, vqsub, u64)
+VQSUB_IMPL(int64x1_t, int64x1_t, vqsub, s64)
+VQSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VQSUB_IMPL(uint8x16_t, uint8x16_t, vqsubq, u8)
+VQSUB_IMPL(int8x16_t, int8x16_t, vqsubq, s8)
+VQSUB_IMPL(uint16x8_t, uint16x8_t, vqsubq, u16)
+VQSUB_IMPL(int16x8_t, int16x8_t, vqsubq, s16)
+VQSUB_IMPL(uint32x4_t, uint32x4_t, vqsubq, u32)
+VQSUB_IMPL(int32x4_t, int32x4_t, vqsubq, s32)
+VQSUB_IMPL(uint64x2_t, uint64x2_t, vqsubq, u64)
+VQSUB_IMPL(int64x2_t, int64x2_t, vqsubq, s64)
+VQSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VQSUB_IMPL
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
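Usage sketch: vqsub clamps at the type bounds instead of wrapping, while the float
"saturating" overloads deliberately fall back to plain vsub:

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    uint8x8_t a = vdup_n_u8(10);
    uint8x8_t b = vdup_n_u8(20);
    uint8x8_t r = arm_compute::wrapper::vqsub(a, b); // 0 per lane, not 246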
diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h
new file mode 100644
index 0000000000..daeaf19997
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/tanh.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_TANH_H
+#define ARM_COMPUTE_WRAPPER_TANH_H
+
+#include "src/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VTANH_IMPL(vtype, prefix, postfix) \
+    inline vtype vtanh(const vtype &a) \
+    { \
+        return prefix##_##postfix(a); \
+    }
+
+VTANH_IMPL(float32x4_t, vtanhq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VTANH_IMPL(float16x8_t, vtanhq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#undef VTANH_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_TANH_H */
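Usage sketch, assuming the vtanhq_f32 approximation from NEMath (the overload the
NEON activation kernels use for TANH):

    #include <arm_neon.h>
    #include "src/core/NEON/wrapper/wrapper.h"

    float32x4_t x = vdupq_n_f32(0.5f);
    float32x4_t y = arm_compute::wrapper::vtanh(x); // each lane ~= tanh(0.5) ~ 0.462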
diff --git a/src/core/NEON/wrapper/intrinsics/tbl.h b/src/core/NEON/wrapper/intrinsics/tbl.h
new file mode 100644
index 0000000000..05e6c1fc13
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/tbl.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_TBL_H
+#define ARM_COMPUTE_WRAPPER_TBL_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VTBL_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vtbl(const stype &a, const vtype &b) \
+    { \
+        return prefix##_##postfix(a, b); \
+    }
+
+VTBL_IMPL(uint8x8x2_t, uint8x8_t, vtbl2, u8)
+VTBL_IMPL(int8x8x2_t, int8x8_t, vtbl2, s8)
+
+#undef VTBL_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_TBL_H */
diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h
new file mode 100644
index 0000000000..642d9261f3
--- /dev/null
+++ b/src/core/NEON/wrapper/scalar/add.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
+#define ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
+{
+    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
+    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+    return vget_lane_u8(vqadd_u8(va, vb), 0);
+}
+
+inline int16_t add_sat(const int16_t &a, const int16_t &b)
+{
+    const int16x4_t va = { a, 0, 0, 0 };
+    const int16x4_t vb = { b, 0, 0, 0 };
+    return vget_lane_s16(vqadd_s16(va, vb), 0);
+}
+
+inline int32_t add_sat(const int32_t &a, const int32_t &b)
+{
+    const int32x2_t va = { a, 0 };
+    const int32x2_t vb = { b, 0 };
+    return vget_lane_s32(vqadd_s32(va, vb), 0);
+}
+
+inline float add_sat(const float &a, const float &b)
+{
+    // No notion of saturation exists in floating point
+    return a + b;
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t add_sat(const float16_t &a, const float16_t &b)
+{
+    // No notion of saturation exists in floating point
+    return a + b;
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SCALAR_ADD_H */
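Usage sketch: add_sat builds scalar saturating arithmetic out of the vector vqadd
intrinsics by placing each operand in lane 0 and reading the result back:

    #include "src/core/NEON/wrapper/wrapper.h"

    uint8_t r = arm_compute::wrapper::add_sat(uint8_t{ 200 }, uint8_t{ 100 }); // 255, not 44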
diff --git a/src/core/NEON/wrapper/scalar/scalar.h b/src/core/NEON/wrapper/scalar/scalar.h
new file mode 100644
index 0000000000..8be37e55ba
--- /dev/null
+++ b/src/core/NEON/wrapper/scalar/scalar.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SCALAR_H
+#define ARM_COMPUTE_WRAPPER_SCALAR_H
+
+#include "src/core/NEON/wrapper/scalar/add.h"
+#include "src/core/NEON/wrapper/scalar/sub.h"
+
+#endif /* ARM_COMPUTE_WRAPPER_SCALAR_H */
diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h
new file mode 100644
index 0000000000..1fe51d75fc
--- /dev/null
+++ b/src/core/NEON/wrapper/scalar/sub.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
+#define ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
+{
+    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
+    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+    return vget_lane_u8(vqsub_u8(va, vb), 0);
+}
+
+inline int16_t sub_sat(const int16_t &a, const int16_t &b)
+{
+    const int16x4_t va = { a, 0, 0, 0 };
+    const int16x4_t vb = { b, 0, 0, 0 };
+    return vget_lane_s16(vqsub_s16(va, vb), 0);
+}
+
+inline int32_t sub_sat(const int32_t &a, const int32_t &b)
+{
+    const int32x2_t va = { a, 0 };
+    const int32x2_t vb = { b, 0 };
+    return vget_lane_s32(vqsub_s32(va, vb), 0);
+}
+
+inline float sub_sat(const float &a, const float &b)
+{
+    // No notion of saturation exists in floating point
+    return a - b;
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t sub_sat(const float16_t &a, const float16_t &b)
+{
+    // No notion of saturation exists in floating point
+    return a - b;
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SCALAR_SUB_H */
diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h
new file mode 100644
index 0000000000..eafbeef372
--- /dev/null
+++ b/src/core/NEON/wrapper/traits.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H
+#define ARM_COMPUTE_WRAPPER_TRAITS_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+namespace traits
+{
+// *INDENT-OFF*
+// clang-format off
+
+/** 64-bit vector tag */
+struct vector_64_tag {};
+/** 128-bit vector tag */
+struct vector_128_tag {};
+
+/** Create the appropriate NEON vector given its type and size in terms of elements */
+template <typename T, int S> struct neon_vector;
+
+// Specializations
+#ifndef DOXYGEN_SKIP_THIS
+template <> struct neon_vector<uint8_t, 8>{ using scalar_type = uint8_t; using type = uint8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int8_t, 8>{ using scalar_type = int8_t; using type = int8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint8_t, 16>{ using scalar_type = uint8_t; using type = uint8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int8_t, 16>{ using scalar_type = int8_t; using type = int8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<uint16_t, 4>{ using scalar_type = uint16_t; using type = uint16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int16_t, 4>{ using scalar_type = int16_t; using type = int16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint16_t, 8>{ using scalar_type = uint16_t; using type = uint16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<uint16_t, 16>{ using scalar_type = uint16_t; using type = uint16x8x2_t; };
+template <> struct neon_vector<int16_t, 8>{ using scalar_type = int16_t; using type = int16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int16_t, 16>{ using scalar_type = int16_t; using type = int16x8x2_t; };
+template <> struct neon_vector<uint32_t, 2>{ using scalar_type = uint32_t; using type = uint32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int32_t, 2>{ using scalar_type = int32_t; using type = int32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint32_t, 4>{ using scalar_type = uint32_t; using type = uint32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int32_t, 4>{ using scalar_type = int32_t; using type = int32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<uint64_t, 1>{ using scalar_type = uint64_t; using type = uint64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<int64_t, 1>{ using scalar_type = int64_t; using type = int64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<uint64_t, 2>{ using scalar_type = uint64_t; using type = uint64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<int64_t, 2>{ using scalar_type = int64_t; using type = int64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_vector<float_t, 2>{ using scalar_type = float_t; using type = float32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<float_t, 4>{ using scalar_type = float_t; using type = float32x4_t; using tag_type = vector_128_tag; };
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> struct neon_vector<float16_t, 4>{ using scalar_type = float16_t; using type = float16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_vector<float16_t, 8>{ using scalar_type = float16_t; using type = float16x8_t; using tag_type = vector_128_tag; };
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif /* DOXYGEN_SKIP_THIS */
+
+/** Helper type template to get the type of a neon vector */
+template <typename T, int S> using neon_vector_t = typename neon_vector<T, S>::type;
+/** Helper type template to get the tag type of a neon vector */
+template <typename T, int S> using neon_vector_tag_t = typename neon_vector<T, S>::tag_type;
+
+/** Vector bit-width enum class */
+enum class BitWidth
+{
+    W64,  /**< 64-bit width */
+    W128, /**< 128-bit width */
+};
+
+/** Create the appropriate NEON vector given its type and size in terms of bits */
+template <typename T, BitWidth BW> struct neon_bitvector;
+// Specializations
+#ifndef DOXYGEN_SKIP_THIS
+template <> struct neon_bitvector<uint8_t, BitWidth::W64>{ using type = uint8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int8_t, BitWidth::W64>{ using type = int8x8_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint8_t, BitWidth::W128>{ using type = uint8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int8_t, BitWidth::W128>{ using type = int8x16_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<uint16_t, BitWidth::W64>{ using type = uint16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int16_t, BitWidth::W64>{ using type = int16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint16_t, BitWidth::W128>{ using type = uint16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int16_t, BitWidth::W128>{ using type = int16x8_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<uint32_t, BitWidth::W64>{ using type = uint32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int32_t, BitWidth::W64>{ using type = int32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint32_t, BitWidth::W128>{ using type = uint32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int32_t, BitWidth::W128>{ using type = int32x4_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<uint64_t, BitWidth::W64>{ using type = uint64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<int64_t, BitWidth::W64>{ using type = int64x1_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<uint64_t, BitWidth::W128>{ using type = uint64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<int64_t, BitWidth::W128>{ using type = int64x2_t; using tag_type = vector_128_tag; };
+template <> struct neon_bitvector<float_t, BitWidth::W64>{ using type = float32x2_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<float_t, BitWidth::W128>{ using type = float32x4_t; using tag_type = vector_128_tag; };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> struct neon_bitvector<float16_t, BitWidth::W64>{ using type = float16x4_t; using tag_type = vector_64_tag; };
+template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float16x8_t; using tag_type = vector_128_tag; };
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif /* DOXYGEN_SKIP_THIS */
+
+/** Helper type template to get the type of a neon vector */
+template <typename T, BitWidth BW> using neon_bitvector_t = typename neon_bitvector<T, BW>::type;
+/** Helper type template to get the tag type of a neon vector */
+template <typename T, BitWidth BW> using neon_bitvector_tag_t = typename neon_bitvector<T, BW>::tag_type;
+
+/** Promote a type */
+template <typename T> struct promote { };
+template <> struct promote <uint8_t> { using type = uint16_t; };
+template <> struct promote <int8_t> { using type = int16_t; };
+template <> struct promote <uint16_t> { using type = uint32_t; };
+template <> struct promote <int16_t> { using type = int32_t; };
+template <> struct promote <uint32_t> { using type = uint64_t; };
+template <> struct promote <int32_t> { using type = int64_t; };
+template <> struct promote <float> { using type = float; };
+template <> struct promote <half> { using type = half; };
+
+/** Get promoted type */
+template <typename T>
+using promote_t = typename promote<T>::type;
+
+// clang-format on
+// *INDENT-ON*
+} // namespace traits
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */
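A sketch of how the traits are consumed (assuming float_t aliases float, as it does
on the supported toolchains): a kernel derives its vector type from a scalar type
plus a lane count or bit width, and tag-dispatches on the register size.

    #include <type_traits>
    #include "src/core/NEON/wrapper/wrapper.h"

    using namespace arm_compute::wrapper::traits;
    using vec_t = neon_vector_t<float, 4>;     // float32x4_t
    using tag_t = neon_vector_tag_t<float, 4>; // vector_128_tag
    static_assert(std::is_same<tag_t, vector_128_tag>::value, "128-bit vector");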
diff --git a/src/core/NEON/wrapper/wrapper.h b/src/core/NEON/wrapper/wrapper.h
new file mode 100644
index 0000000000..e5467e98ff
--- /dev/null
+++ b/src/core/NEON/wrapper/wrapper.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_H
+#define ARM_COMPUTE_WRAPPER_H
+
+// Traits
+#include "src/core/NEON/wrapper/traits.h"
+
+// Intrinsics Overloads
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/NEON/wrapper/scalar/scalar.h"
+
+#endif /* ARM_COMPUTE_WRAPPER_H */
--
cgit v1.2.1