From 46d44d26183d835d209d7ef1b9023e217dd4019d Mon Sep 17 00:00:00 2001 From: Yair Schwarzbaum Date: Wed, 12 Jan 2022 16:38:58 +0200 Subject: Enable kernel selection testing (Phase #2) Resolves COMPMID-4987 Change-Id: I1201ca3eae107989d13b6a2c6d9560de24fe112d Signed-off-by: Yair Schwarzbaum Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7015 Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins --- src/cpu/ICpuKernel.h | 6 +- src/cpu/kernels/CpuActivationKernel.h | 2 +- src/cpu/kernels/CpuAddKernel.cpp | 4 +- src/cpu/kernels/CpuAddKernel.h | 2 +- src/cpu/kernels/CpuCastKernel.h | 2 +- src/cpu/kernels/CpuCol2ImKernel.h | 2 +- src/cpu/kernels/CpuConcatenateBatchKernel.h | 2 +- src/cpu/kernels/CpuConcatenateDepthKernel.h | 2 +- src/cpu/kernels/CpuConcatenateHeightKernel.h | 2 +- src/cpu/kernels/CpuConcatenateWidthKernel.h | 2 +- .../CpuConvertFullyConnectedWeightsKernel.h | 2 +- .../kernels/CpuConvertQuantizedSignednessKernel.h | 2 +- src/cpu/kernels/CpuCopyKernel.h | 2 +- src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h | 2 +- src/cpu/kernels/CpuDequantizeKernel.h | 2 +- src/cpu/kernels/CpuDirectConv2dKernel.h | 2 +- src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h | 2 +- src/cpu/kernels/CpuDirectConv3dKernel.h | 2 +- src/cpu/kernels/CpuElementwiseKernel.h | 2 +- src/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 2 +- src/cpu/kernels/CpuElementwiseUnaryKernel.h | 2 +- src/cpu/kernels/CpuFillKernel.h | 2 +- src/cpu/kernels/CpuFloorKernel.h | 2 +- src/cpu/kernels/CpuGemmInterleave4x4Kernel.h | 2 +- src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h | 2 +- src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h | 4 +- .../kernels/CpuGemmLowpOffsetContributionKernel.h | 2 +- ...puGemmLowpOffsetContributionOutputStageKernel.h | 2 +- .../CpuGemmLowpQuantizeDownInt32ScaleKernel.h | 2 +- ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h | 2 +- ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 2 +- ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h | 2 +- src/cpu/kernels/CpuGemmMatrixAdditionKernel.h | 2 +- src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h | 2 +- src/cpu/kernels/CpuGemmTranspose1xWKernel.h | 2 +- src/cpu/kernels/CpuIm2ColKernel.h | 2 +- src/cpu/kernels/CpuMulKernel.h | 4 +- src/cpu/kernels/CpuPermuteKernel.h | 2 +- src/cpu/kernels/CpuPool2dKernel.cpp | 6 +- src/cpu/kernels/CpuPool2dKernel.h | 2 +- src/cpu/kernels/CpuQuantizeKernel.h | 2 +- src/cpu/kernels/CpuReshapeKernel.h | 2 +- src/cpu/kernels/CpuScaleKernel.h | 2 +- src/cpu/kernels/CpuSoftmaxKernel.cpp | 256 +++++++-------------- src/cpu/kernels/CpuSoftmaxKernel.h | 36 +-- src/cpu/kernels/CpuSubKernel.h | 2 +- src/cpu/kernels/CpuTransposeKernel.h | 2 +- src/cpu/kernels/CpuWeightsReshapeKernel.h | 2 +- src/cpu/kernels/CpuWinogradConv2dKernel.h | 6 +- .../CpuDepthwiseConv2dAssemblyWrapperKernel.h | 2 +- .../internal/CpuPool2dAssemblyWrapperKernel.h | 2 +- src/cpu/operators/CpuConcatenate.h | 4 +- src/cpu/operators/CpuSoftmax.h | 6 +- 53 files changed, 171 insertions(+), 247 deletions(-) diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h index 03aec5c08e..8f4106240d 100644 --- a/src/cpu/ICpuKernel.h +++ b/src/cpu/ICpuKernel.h @@ -37,12 +37,8 @@ enum class KernelSelectionType Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ }; -using ICpuKernel = arm_compute::ICPPKernel; - template -/* This is a temp name for stage 1 process of adding UT for multi-ISA. -In the next stage NewICpuKernel will be called ICpuKernel again */ -class NewICpuKernel : public ICPPKernel +class ICpuKernel : public ICPPKernel { public: /** Micro-kernel selector diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index ac974850aa..b0476303f0 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Interface for the activation kernel */ -class CpuActivationKernel : public NewICpuKernel +class CpuActivationKernel : public ICpuKernel { private: using ActivationKernelPtr = std::add_pointer::type; diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp index deb7379aea..d06621fae0 100644 --- a/src/cpu/kernels/CpuAddKernel.cpp +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -214,7 +214,7 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I // Configure kernel window auto win_config = validate_and_configure_window(*src0, *src1, *dst); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - NewICpuKernel::configure(win_config.second); + ICpuKernel::configure(win_config.second); } Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) @@ -231,7 +231,7 @@ void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const Thre { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(NewICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(tensors.empty()); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h index 93b86de4ae..6638135580 100644 --- a/src/cpu/kernels/CpuAddKernel.h +++ b/src/cpu/kernels/CpuAddKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Interface for the kernel to perform addition between two tensors */ -class CpuAddKernel : public NewICpuKernel +class CpuAddKernel : public ICpuKernel { private: using AddKernelPtr = std::add_pointer::type; diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h index 9aeb537044..7679178fa1 100644 --- a/src/cpu/kernels/CpuCastKernel.h +++ b/src/cpu/kernels/CpuCastKernel.h @@ -37,7 +37,7 @@ namespace kernels * * @note When casting between quantized types the scale and zeroPoint are ignored */ -class CpuCastKernel : public NewICpuKernel +class CpuCastKernel : public ICpuKernel { public: CpuCastKernel() = default; diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h index 43be476b2f..deafcc14df 100644 --- a/src/cpu/kernels/CpuCol2ImKernel.h +++ b/src/cpu/kernels/CpuCol2ImKernel.h @@ -52,7 +52,7 @@ namespace kernels * \end{array} \right) * @f] */ -class CpuCol2ImKernel : public NewICpuKernel +class CpuCol2ImKernel : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h index 2b5946571b..0de68a5d64 100644 --- a/src/cpu/kernels/CpuConcatenateBatchKernel.h +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h @@ -36,7 +36,7 @@ namespace kernels /** Interface for the batch concatenate kernel. * The input tensor will be concatenated into the output tensor. */ -class CpuConcatenateBatchKernel : public NewICpuKernel +class CpuConcatenateBatchKernel : public ICpuKernel { public: CpuConcatenateBatchKernel() = default; diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h index 90b68d3a06..5a0edb95bb 100644 --- a/src/cpu/kernels/CpuConcatenateDepthKernel.h +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h @@ -40,7 +40,7 @@ namespace kernels /** Interface for the depth concatenate kernel. * The input tensor will be concatenated into the output tensor. */ -class CpuConcatenateDepthKernel : public NewICpuKernel +class CpuConcatenateDepthKernel : public ICpuKernel { public: CpuConcatenateDepthKernel() = default; diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h index 8ace9809cc..74d5d0c2c3 100644 --- a/src/cpu/kernels/CpuConcatenateHeightKernel.h +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h @@ -36,7 +36,7 @@ namespace kernels /** Interface for the height concatenate kernel. * The source tensor will be concatenated into the destination tensor. */ -class CpuConcatenateHeightKernel : public NewICpuKernel +class CpuConcatenateHeightKernel : public ICpuKernel { public: CpuConcatenateHeightKernel() = default; diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h index d5f2ef24d6..418bc51b33 100644 --- a/src/cpu/kernels/CpuConcatenateWidthKernel.h +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h @@ -37,7 +37,7 @@ namespace kernels /** Interface for the width concatenate kernel. * The source tensor will be concatenated into the destination tensor. */ -class CpuConcatenateWidthKernel : public NewICpuKernel +class CpuConcatenateWidthKernel : public ICpuKernel { public: CpuConcatenateWidthKernel() = default; diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h index 001a6fcab0..9a1393323b 100644 --- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h @@ -41,7 +41,7 @@ namespace kernels * * @note This function assumes the weights are already reshaped (transposed) */ -class CpuConvertFullyConnectedWeightsKernel : public NewICpuKernel +class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel { public: CpuConvertFullyConnectedWeightsKernel() = default; diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h index 9d5ee39126..b5eaf65487 100644 --- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */ -class CpuConvertQuantizedSignednessKernel : public NewICpuKernel +class CpuConvertQuantizedSignednessKernel : public ICpuKernel { public: CpuConvertQuantizedSignednessKernel() = default; diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h index ee4adeb4eb..c9ef8eba76 100644 --- a/src/cpu/kernels/CpuCopyKernel.h +++ b/src/cpu/kernels/CpuCopyKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Kernel to perform a copy between two tensors */ -class CpuCopyKernel : public NewICpuKernel +class CpuCopyKernel : public ICpuKernel { public: CpuCopyKernel() = default; diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h index eae682bb6d..e23a0fac87 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h @@ -40,7 +40,7 @@ namespace cpu namespace kernels { /** Interface for the kernel to run a depthwise convolution native on a tensor. */ -class CpuDepthwiseConv2dNativeKernel : public NewICpuKernel +class CpuDepthwiseConv2dNativeKernel : public ICpuKernel { public: CpuDepthwiseConv2dNativeKernel() = default; diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h index 834c039a76..cfa991dc74 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.h +++ b/src/cpu/kernels/CpuDequantizeKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Interface for the dequantization layer kernel. */ -class CpuDequantizeKernel : public NewICpuKernel +class CpuDequantizeKernel : public ICpuKernel { public: CpuDequantizeKernel() = default; diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h index 09fa5898cc..6ec4d4ee04 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Interface for the kernel to perform Direct Convolution Layer. */ -class CpuDirectConv2dKernel : public NewICpuKernel +class CpuDirectConv2dKernel : public ICpuKernel { public: CpuDirectConv2dKernel() = default; diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h index 95011f79aa..d3ef17b7c9 100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h @@ -40,7 +40,7 @@ namespace kernels * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part * of the @ref DirectConvolutionLayerOutputStageKernelInfo. */ -class CpuDirectConv2dOutputStageKernel : public NewICpuKernel +class CpuDirectConv2dOutputStageKernel : public ICpuKernel { public: CpuDirectConv2dOutputStageKernel() = default; diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h index 6ae70bd3b7..688f368b9f 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.h +++ b/src/cpu/kernels/CpuDirectConv3dKernel.h @@ -35,7 +35,7 @@ namespace cpu namespace kernels { /** Interface for the kernel to perform 3D Direct Convolution Layer. */ -class CpuDirectConv3dKernel : public NewICpuKernel +class CpuDirectConv3dKernel : public ICpuKernel { private: /* Template function for convolution 3d NDHWC */ diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h index bb081cbec1..8cd5d58a96 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.h +++ b/src/cpu/kernels/CpuElementwiseKernel.h @@ -39,7 +39,7 @@ namespace kernels * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f] * */ -class CpuElementwiseKernel : public NewICpuKernel +class CpuElementwiseKernel : public ICpuKernel { public: CpuElementwiseKernel() = default; diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 79c4896924..e8211fe93e 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -108,7 +108,7 @@ void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo auto shape_and_window = compute_output_shape_and_window(src.tensor_shape()); auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type()); - NewICpuKernel::configure(shape_and_window.second); + ICpuKernel::configure(shape_and_window.second); } Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h index c520b89618..138049a60c 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h @@ -39,7 +39,7 @@ namespace kernels * Element-wise operation is computed by: * @f[ dst(x) = OP(src(x))@f] */ -class CpuElementwiseUnaryKernel : public NewICpuKernel +class CpuElementwiseUnaryKernel : public ICpuKernel { private: using ElementwiseUnaryUkernelPtr = std::add_pointer::type; diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h index 5262ecc5c6..ce41afc462 100644 --- a/src/cpu/kernels/CpuFillKernel.h +++ b/src/cpu/kernels/CpuFillKernel.h @@ -35,7 +35,7 @@ namespace cpu namespace kernels { /** Kernel for filling a tensor with a given constant value */ -class CpuFillKernel : public NewICpuKernel +class CpuFillKernel : public ICpuKernel { public: CpuFillKernel() = default; diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h index 2b102a0515..35ab534ca8 100644 --- a/src/cpu/kernels/CpuFloorKernel.h +++ b/src/cpu/kernels/CpuFloorKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Cpu accelarated kernel to perform a floor operation */ -class CpuFloorKernel : public NewICpuKernel +class CpuFloorKernel : public ICpuKernel { private: using FloorKernelPtr = std::add_pointer::type; diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h index 13b46142c4..4fb6a52a8b 100644 --- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h @@ -52,7 +52,7 @@ namespace kernels * * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] */ -class CpuGemmInterleave4x4Kernel : public NewICpuKernel +class CpuGemmInterleave4x4Kernel : public ICpuKernel { public: CpuGemmInterleave4x4Kernel() = default; diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h index 6d06f12e54..2cc789d6d9 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h @@ -43,7 +43,7 @@ namespace kernels * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 * */ -class CpuGemmLowpMatrixMultiplyKernel : public NewICpuKernel +class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h index 6cced66b47..e469629cdb 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h @@ -40,7 +40,7 @@ namespace kernels * @note This stage is needed to handle the offset of matrix product * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md */ -class CpuGemmLowpMatrixAReductionKernel : public NewICpuKernel +class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel { public: /** Default constructor */ @@ -98,7 +98,7 @@ private: * @note This stage is needed to handle the offset of matrix product * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md */ -class CpuGemmLowpMatrixBReductionKernel : public NewICpuKernel +class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h index 1d70c0619e..3514ca811d 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h @@ -46,7 +46,7 @@ namespace kernels * (a_offset * b_offset * k) * */ -class CpuGemmLowpOffsetContributionKernel : public NewICpuKernel +class CpuGemmLowpOffsetContributionKernel : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h index 13c64f4631..ad8b05e49a 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h @@ -63,7 +63,7 @@ namespace kernels * (a_offset * b_offset * k) */ -class CpuGemmLowpOffsetContributionOutputStageKernel : public NewICpuKernel +class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h index f6e8c816f3..c7813edcd7 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h @@ -51,7 +51,7 @@ namespace kernels * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. * */ -class CpuGemmLowpQuantizeDownInt32ScaleKernel : public NewICpuKernel +class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ScaleKernel() = default; diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h index a9e2560657..681d099695 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -48,7 +48,7 @@ namespace kernels * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. * */ -class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public NewICpuKernel +class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h index bfac8681a5..3e615b935e 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -49,7 +49,7 @@ namespace kernels * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. * */ -class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public NewICpuKernel +class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index 5e5683cfc3..b773fdfdcf 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -49,7 +49,7 @@ namespace kernels * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. * */ -class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public NewICpuKernel +class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel { public: CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h index 64338259e9..4a748218d1 100644 --- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h @@ -41,7 +41,7 @@ namespace kernels * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel * - MTX_1 = C */ -class CpuGemmMatrixAdditionKernel : public NewICpuKernel +class CpuGemmMatrixAdditionKernel : public ICpuKernel { public: CpuGemmMatrixAdditionKernel() = default; diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h index 757b46e9a7..9c3dc8b1a0 100644 --- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h @@ -39,7 +39,7 @@ namespace kernels * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped * */ -class CpuGemmMatrixMultiplyKernel : public NewICpuKernel +class CpuGemmMatrixMultiplyKernel : public ICpuKernel { public: CpuGemmMatrixMultiplyKernel() = default; diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h index 2acda35947..0ca92641b7 100644 --- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h @@ -68,7 +68,7 @@ namespace kernels * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * */ -class CpuGemmTranspose1xWKernel : public NewICpuKernel +class CpuGemmTranspose1xWKernel : public ICpuKernel { public: CpuGemmTranspose1xWKernel() = default; diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h index d789adef95..8160310da6 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.h +++ b/src/cpu/kernels/CpuIm2ColKernel.h @@ -58,7 +58,7 @@ namespace kernels * \end{array} \right) * @f] */ -class CpuIm2ColKernel : public NewICpuKernel +class CpuIm2ColKernel : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h index 3ab198510f..85fcf88a96 100644 --- a/src/cpu/kernels/CpuMulKernel.h +++ b/src/cpu/kernels/CpuMulKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Interface for the kernel to perform multiplication between two tensors */ -class CpuMulKernel : public NewICpuKernel +class CpuMulKernel : public ICpuKernel { public: CpuMulKernel() = default; @@ -118,7 +118,7 @@ private: }; /** Interface for the complex pixelwise multiplication kernel. */ -class CpuComplexMulKernel : public NewICpuKernel +class CpuComplexMulKernel : public ICpuKernel { public: CpuComplexMulKernel() = default; diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h index aae28582b1..9e1b93318e 100644 --- a/src/cpu/kernels/CpuPermuteKernel.h +++ b/src/cpu/kernels/CpuPermuteKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Kernel to perform tensor permutation given a permutation vector */ -class CpuPermuteKernel : public NewICpuKernel +class CpuPermuteKernel : public ICpuKernel { public: CpuPermuteKernel() = default; diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp index 953a9ffb67..d0ca2d285d 100644 --- a/src/cpu/kernels/CpuPool2dKernel.cpp +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -315,7 +315,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin { // Configure kernel window Window win = calculate_max_window(*dst, Steps()); - NewICpuKernel::configure(win); + ICpuKernel::configure(win); } else { @@ -323,7 +323,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - NewICpuKernel::configure(win_config.second); + ICpuKernel::configure(win_config.second); } } @@ -356,7 +356,7 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(NewICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h index 7fd3247d6d..c952ea839d 100644 --- a/src/cpu/kernels/CpuPool2dKernel.h +++ b/src/cpu/kernels/CpuPool2dKernel.h @@ -35,7 +35,7 @@ namespace cpu namespace kernels { /** Interface for the pooling layer kernel */ -class CpuPool2dKernel : public NewICpuKernel +class CpuPool2dKernel : public ICpuKernel { private: using PoolingKernelPtr = std::add_pointer::type; diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h index 709e1c89c7..28690bea54 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.h +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -37,7 +37,7 @@ namespace kernels * * @note The implementation supports only 3D input tensors */ -class CpuQuantizeKernel : public NewICpuKernel +class CpuQuantizeKernel : public ICpuKernel { public: CpuQuantizeKernel() = default; diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h index 6a5c528ecd..17302c6731 100644 --- a/src/cpu/kernels/CpuReshapeKernel.h +++ b/src/cpu/kernels/CpuReshapeKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Interface for the kernel to perform tensor reshaping */ -class CpuReshapeKernel : public NewICpuKernel +class CpuReshapeKernel : public ICpuKernel { public: CpuReshapeKernel() = default; diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h index 94bbdb72a0..e0e9e387bd 100644 --- a/src/cpu/kernels/CpuScaleKernel.h +++ b/src/cpu/kernels/CpuScaleKernel.h @@ -35,7 +35,7 @@ namespace cpu namespace kernels { /** Arm(R) Neon(TM) kernel to perform scaling on a tensor */ -class CpuScaleKernel : public NewICpuKernel +class CpuScaleKernel : public ICpuKernel { private: /** Scale function to use for the particular function to use */ diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index 054adfa23c..6766b10120 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "src/cpu/kernels/CpuSoftmaxKernel.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -30,12 +29,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/softmax/list.h" - namespace arm_compute { namespace cpu @@ -44,164 +41,60 @@ namespace kernels { namespace { -struct SoftmaxSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; -using SoftmaxSelectorPtr = std::add_pointer::type; -using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; -using SoftmaxLogits1DKernelPtr = std::add_pointer::type; - -struct SoftmaxLogits1DKernel -{ - const char *name; - const SoftmaxSelectorPtr is_selected; - SoftmaxLogits1DKernelPtr ukernel; -}; - -struct SoftmaxLogits1DMaxKernel -{ - const char *name; - const SoftmaxSelectorPtr is_selected; - SoftmaxLogits1DMaxKernelPtr ukernel; -}; - -static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_softmax) - }, - { - "sve_fp16_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_softmax) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ - -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_softmax) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_softmax) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ - -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_softmax) - }, - { - "sve2_qs8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_softmax) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_qu8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax) - }, - { - "neon_qs8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax) - }, -#endif //defined(ARM_COMPUTE_ENABLE_NEON) -}; - -static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = +/* Softmax Logits 1D Max - identifying the max value of 1D Logits */ +static const std::vector available_kernels_max_logits = { #if defined(ARM_COMPUTE_ENABLE_SVE) { "sve_fp32_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_logits) }, { "sve_fp16_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve; }, REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_logits) }, { "sve_qu8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_qasymm8_logits) }, { "sve_qs8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_qasymm8_signed_logits) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { "neon_fp32_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_logits) }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "neon_fp16_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_logits) }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { "neon_qu8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_logits) }, { "neon_qs8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_singed_logits) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ }; - -const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data) -{ - for(const auto &uk : available_logits_1d_kernels) - { - if(uk.is_selected({ data.dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} - -const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data) -{ - for(const auto &uk : available_logits_1d_max_kernels) - { - if(uk.is_selected({ data.dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} - Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - // Validate in case of configured output if(output.total_size() != 0) { @@ -209,58 +102,104 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); } - return Status{}; } - -} // namespace - +} //namespace +const std::vector &CpuLogits1DMaxKernel::get_available_kernels() +{ + return available_kernels_max_logits; +} void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); - // Softmax across the x dimension const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - - const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); + const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - _run_method = uk->ukernel; _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); - - Window win = calculate_max_window(*src, Steps()); + Window win = calculate_max_window(*src, Steps()); ICpuKernel::configure(win); } - Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); - return Status{}; } - void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - _run_method(src, dst, window); } - const char *CpuLogits1DMaxKernel::name() const { return _name.c_str(); } +/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */ +template +static const std::vector::SoftmaxLogits1DKernel> available_kernels_logits = +{ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp32_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_softmax) + }, + { + "sve_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve; }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_softmax) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_fp32_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_softmax) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_softmax) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) + { + "sve2_qu8_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_softmax) + }, + { + "sve2_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_softmax) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_qu8_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax) + }, + { + "neon_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax) + }, +#endif //defined(ARM_COMPUTE_ENABLE_NEON) +}; namespace { Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max, @@ -270,14 +209,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn // Check input ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - // Check max ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); - // Check output if configured if(dst.total_size() != 0) { @@ -286,7 +222,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); } - // Check tmp if configured if(tmp.total_size() != 0) { @@ -296,84 +231,69 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn // on the maximum number of threads that will run in parallel. ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); } - return Status{}; } } // namespace - +template +const std::vector::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel::get_available_kernels() +{ + return available_kernels_logits; +} template void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); - // Configure kernel window const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); - // Output auto initialization if not yet initialized const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); - // Tmp auto initialization if not yet initialized const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); - - const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); + const auto *uk = CpuLogits1DSoftmaxKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); - - _beta = beta; - _run_method = uk->ukernel; - _name = kernel_name.append("/").append(uk->name); - + _beta = beta; + _run_method = uk->ukernel; + _name = kernel_name.append("/").append(uk->name); // Configure kernel window Window win = calculate_max_window(*max, Steps()); - - ICpuKernel::configure(win); + ICPPKernel::configure(win); } - template Status CpuLogits1DSoftmaxKernel::validate(const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); - return Status{}; } - template void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto max = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); - + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto max = tensors.get_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; - ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); } - template const char *CpuLogits1DSoftmaxKernel::name() const { return _name.c_str(); } - template class CpuLogits1DSoftmaxKernel; template class CpuLogits1DSoftmaxKernel; - } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index f317662620..df7d3f7d9b 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -23,10 +23,8 @@ */ #ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H #define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H - #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" - namespace arm_compute { namespace cpu @@ -34,8 +32,11 @@ namespace cpu namespace kernels { /** Interface for the identifying the max value of 1D Logits */ -class CpuLogits1DMaxKernel : public NewICpuKernel +class CpuLogits1DMaxKernel : public ICpuKernel { +private: + using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; + public: CpuLogits1DMaxKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); @@ -52,27 +53,31 @@ public: * @return a status */ static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; - -private: - using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; + struct SoftmaxLogits1DMaxKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + SoftmaxLogits1DMaxKernelPtr ukernel; + }; + static const std::vector &get_available_kernels(); private: SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; std::string _name{}; }; - /** Interface for softmax computation for QASYMM8 with pre-computed max. */ template -class CpuLogits1DSoftmaxKernel : public NewICpuKernel> +class CpuLogits1DSoftmaxKernel : public ICpuKernel> { +private: + using SoftmaxLogits1DKernelPtr = std::add_pointer::type; + public: CpuLogits1DSoftmaxKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); - /** Set the input and output tensors. * * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -92,13 +97,16 @@ public: */ static Status validate(const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp); - // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; - -private: - using SoftmaxLogits1DKernelPtr = std::add_pointer::type; + struct SoftmaxLogits1DKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + SoftmaxLogits1DKernelPtr ukernel; + }; + static const std::vector &get_available_kernels(); private: float _beta{ 1.0f }; diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h index 70f34b1b57..323a3f1316 100644 --- a/src/cpu/kernels/CpuSubKernel.h +++ b/src/cpu/kernels/CpuSubKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Interface for the kernel to perform subtraction between two tensors */ -class CpuSubKernel : public NewICpuKernel +class CpuSubKernel : public ICpuKernel { private: using SubKernelPtr = std::add_pointer::type; diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h index 7e1ee5f73d..cb85daeb40 100644 --- a/src/cpu/kernels/CpuTransposeKernel.h +++ b/src/cpu/kernels/CpuTransposeKernel.h @@ -34,7 +34,7 @@ namespace cpu namespace kernels { /** Kernel which transposes the elements of a matrix */ -class CpuTransposeKernel : public NewICpuKernel +class CpuTransposeKernel : public ICpuKernel { public: CpuTransposeKernel() = default; diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h index 6c2d7ef6f9..1a260edc96 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.h +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h @@ -56,7 +56,7 @@ namespace kernels * \end{array} \right) * @f] */ -class CpuWeightsReshapeKernel : public NewICpuKernel +class CpuWeightsReshapeKernel : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h index 0c4e28c394..6909216d94 100644 --- a/src/cpu/kernels/CpuWinogradConv2dKernel.h +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h @@ -35,7 +35,7 @@ namespace arm_compute namespace cpu { /** Interface for the kernel to perform Winograd input transform. */ -class ICpuWinogradConv2dTransformInputKernel : public NewICpuKernel +class ICpuWinogradConv2dTransformInputKernel : public ICpuKernel { public: /** Get the working space required to perform the transformation. @@ -216,7 +216,7 @@ private: }; /** Interface for the kernel to perform Winograd output transform. */ -class ICpuWinogradConv2dTransformOutputKernel : public NewICpuKernel +class ICpuWinogradConv2dTransformOutputKernel : public ICpuKernel { public: /** Get the working space required to perform the transformation. @@ -418,7 +418,7 @@ private: }; /** Interface for the kernel to perform Winograd weights transform. */ -class ICpuWinogradConv2dTransformWeightsKernel : public NewICpuKernel +class ICpuWinogradConv2dTransformWeightsKernel : public ICpuKernel { public: /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h index ea51d5d54d..a32a7a3ec8 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h @@ -45,7 +45,7 @@ namespace cpu namespace kernels { /** This class is a wrapper for the depthwise convolution assembly kernels. */ -class CpuDepthwiseConv2dAssemblyWrapperKernel final : public NewICpuKernel +class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel { public: /** Default constructor */ diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h index daa3168beb..8713d5c54d 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h @@ -46,7 +46,7 @@ namespace kernels * execute a single assembly kernel in the context of an NEFunction. * */ -class CpuPool2dAssemblyWrapperKernel final : public NewICpuKernel +class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel { public: /** Constructor diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h index 001ac68162..eb11926b48 100644 --- a/src/cpu/operators/CpuConcatenate.h +++ b/src/cpu/operators/CpuConcatenate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -67,7 +67,7 @@ public: void run(ITensorPack &tensors) override; private: - std::vector> _concat_kernels{}; + std::vector> _concat_kernels{}; unsigned int _num_srcs{ 0 }; unsigned int _axis{ 0 }; }; diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 20f3f006d3..64df8704f9 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -92,8 +92,8 @@ private: CpuPermute _permute_input; CpuPermute _permute_output; - std::unique_ptr _max_kernel; - std::unique_ptr _softmax_kernel; + std::unique_ptr _max_kernel; + std::unique_ptr _softmax_kernel; TensorInfo _max; TensorInfo _tmp; -- cgit v1.2.1